In [1]:
import ast
import warnings
from typing import Union

import spacy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from gensim import downloader
from gensim.models import Word2Vec

warnings.filterwarnings('ignore')
In [2]:
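# !python -m spacy download en
# NOTE(review): spaCy v3+ no longer accepts the bare `en` shortcut; use a full model
# name such as `en_core_web_sm` (confirm which model this notebook loads later).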

Параметры¶

In [3]:
# Directory and file name of the input CSV; the loading cell joins them with "/".
file_path, file_name = 'parsing_results', 'parsing_articles.csv'

Загрузка данных¶

In [4]:
# Build the relative path "<file_path>/<file_name>" and read the CSV into a DataFrame.
data = pd.read_csv('/'.join([file_path, file_name]))
In [5]:
# Preview the loaded frame; pandas elides the middle rows in the rendered table.
data
Out[5]:
url deapth title authors source number and pages doi published citation metric abstract references
0 https://dl.acm.org/doi/10.1145/2996913.2996996 0 Demand driven store site selection via multipl... ['Mengwen Xu', 'Tianyi Wang', 'Zhengwei Wu', '... SIGSPACIAL '16: Proceedings of the 24th ACM SI... Article No.: 40, Pages 1 - 10 https://doi.org/10.1145/2996913.2996996 31 October 2016 26 617 Choosing a good location when opening a new st... ['https://dl.acm.org/doi/10.1016/S0305-0548(01...
1 https://dl.acm.org/doi/10.1145/2996913.2996996 1 The generalized maximal covering location problem ['Oded Berman', 'Dmitry Krass'] Computers and Operations Research NaN https://doi.org/10.1016/S0305-0548(01)00079-X 01 May 2002 34 0 We consider a generalization of the maximal co... []
2 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Random Forests ['Leo Breiman'] Machine Learning NaN https://doi.org/10.1023/A:1010933404324 01 October 2001 9,828 0 Random forests are a combination of tree predi... ['https://dl.acm.org/doi/10.1162/neco.1997.9.7...
3 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Efficient algorithms for optimal location quer... ['Zitong Chen', 'Yubao Liu', 'Raymond Chi-Wing... SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... NaN https://doi.org/10.1145/2588555.2612172 18 June 2014 47 790 In this paper, we study the optimal location q... ['https://dl.acm.org/doi/10.14778/2350229.2350...
4 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Mean Shift: A Robust Approach Toward Feature S... ['Dorin Comaniciu', 'Peter Meer'] IEEE Transactions on Pattern Analysis and Mach... NaN https://doi.org/10.1109/34.1000236 01 May 2002 2,062 0 A general nonparametric technique is proposed ... ['https://dl.acm.org/doi/10.1007/BF00128233', ...
... ... ... ... ... ... ... ... ... ... ... ... ...
254 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Geographical topic discovery and comparison ['Zhijun Yin', 'Liangliang Cao', 'Jiawei Han',... WWW '11: Proceedings of the 20th international... NaN https://doi.org/10.1145/1963405.1963443 28 March 2011 232 1,642 This paper studies the problem of discovering ... ['https://dl.acm.org/doi/10.5555/944919.944937...
255 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Driving with knowledge from the physical world ['Jing Yuan', 'Yu Zheng', 'Xing Xie', 'Guangzh... KDD '11: Proceedings of the 17th ACM SIGKDD in... NaN https://doi.org/10.1145/2020408.2020462 21 August 2011 641 2,908 This paper presents a Cloud-based system compu... ['https://dl.acm.org/doi/10.1016/j.eswa.2008.0...
256 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Where to find my next passenger ['Jing Yuan', 'Yu Zheng', 'Liuhang Zhang', 'XI... UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030128 17 September 2011 276 2,024 We present a recommender for taxi drivers and ... ['https://dl.acm.org/doi/10.1145/304182.304187...
257 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Urban computing with taxicabs ['Yu Zheng', 'Yanchi Liu', 'Jing Yuan', 'Xing ... UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030126 17 September 2011 413 3,122 Urban computing for city planning is one of th... ['https://dl.acm.org/doi/10.5555/645484.656550...
258 https://dl.acm.org/doi/10.1145/2487575.2487616 2 NaN [] NaN NaN NaN NaN NaN NaN NaN []

259 rows × 12 columns

Форматирование содержимого столбцов¶

In [6]:
def _parse_list_cell(value):
    """Turn a cell holding a Python list literal (e.g. "['A', 'B']") into a real list.

    ast.literal_eval only accepts literals, so text coming from the CSV cannot
    execute code the way eval() would. Non-string values (for example a cell that
    was already parsed on a previous run of this cell) are returned unchanged,
    which keeps the cell safe to re-run.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# `authors` and `references` are stored in the CSV as the text form of Python lists.
data['authors'] = data['authors'].apply(_parse_list_cell)
data['references'] = data['references'].apply(_parse_list_cell)
In [7]:
# Dates are written like "31 October 2016": day, full month name, four-digit year.
data['published'] = data['published'].pipe(pd.to_datetime, format='%d %B %Y')
In [8]:
# Counts arrive as text with thousands separators (e.g. "9,828"):
# strip the commas, then cast to float (float because the column contains NaN).
data['citation'] = data['citation'].replace(',', '', regex=True).astype('float')
In [9]:
# Same cleanup as for `citation`: values such as "1,642" lose their commas
# and the column becomes float (NaN is preserved).
data['metric'] = data['metric'].replace(',', '', regex=True).astype('float')
In [10]:
# Check dtypes and non-null counts after the column conversions in the cells above.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   url               259 non-null    object        
 1   deapth            259 non-null    int64         
 2   title             230 non-null    object        
 3   authors           259 non-null    object        
 4   source            230 non-null    object        
 5   number and pages  9 non-null      object        
 6   doi               187 non-null    object        
 7   published         230 non-null    datetime64[ns]
 8   citation          232 non-null    float64       
 9   metric            230 non-null    float64       
 10  abstract          223 non-null    object        
 11  references        259 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(8)
memory usage: 24.4+ KB

Просмотр данных¶

In [11]:
# Number of rows per distinct `url`; dropna=False would also list missing values.
data.loc[:, 'url'].value_counts(dropna=False)
Out[11]:
url
https://dl.acm.org/doi/10.1145/2996913.2996996    175
https://dl.acm.org/doi/10.1145/2487575.2487616     84
Name: count, dtype: int64
In [12]:
# Number of rows per `deapth` value (the column name is spelled this way in the CSV).
data.loc[:, 'deapth'].value_counts(dropna=False)
Out[12]:
deapth
2    230
1     27
0      2
Name: count, dtype: int64
In [13]:
# How often each title occurs; missing titles are counted as their own NaN bucket.
data.loc[:, 'title'].value_counts(dropna=False)
Out[13]:
title
NaN                                                                                                    29
Fast training of support vector machines using sequential minimal optimization                          4
The generalized maximal covering location problem                                                       3
Discovering regions of different functions in a city using human mobility and POIs                      3
Support Vector Machines                                                                                 3
                                                                                                       ..
The Cascaded Hough Transform as an Aid in Aerial Image Interpretation                                   1
A new approach to clustering                                                                            1
Pfinder: Real-Time Tracking of the Human Body                                                           1
Region Competition: Unifying Snakes, Region Growing, and Bayes/MDL for Multiband Image Segmentation     1
Urban computing with taxicabs                                                                           1
Name: count, Length: 195, dtype: int64
In [14]:
# How often each author list occurs (cells hold Python lists after the parsing step;
# an empty list means no authors were recorded).
data.loc[:, 'authors'].value_counts(dropna=False)
Out[14]:
authors
[]                                                                                      30
[Thorsten Joachims]                                                                      4
[Kalervo Järvelin, Jaana Kekäläinen]                                                     4
[John C. Platt]                                                                          4
[Daniele Quercia, Neal Lathia, Francesco Calabrese, Giusy Di Lorenzo, Jon Crowcroft]     3
                                                                                        ..
[Christopher Richard Wren, Ali Azarbayejani, Trevor Darrell, Alex Paul Pentland]         1
[Song Chun Zhu, Alan Yuille]                                                             1
[Xinhua Zhuang, Yan Huang, K. Palaniappan, Yunxin Zhao]                                  1
[Zhe Cao, Tao Qin, Tie-Yan Liu, Ming-Feng Tsai, Hang Li]                                 1
[Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie]                                              1
Name: count, Length: 188, dtype: int64
In [15]:
# Number of rows per publication source, with missing sources shown as NaN.
data.loc[:, 'source'].value_counts(dropna=False)
Out[15]:
source
NaN                                                                                                                                                        29
IEEE Transactions on Pattern Analysis and Machine Intelligence                                                                                             15
Advances in kernel methods: support vector learning                                                                                                         8
KDD '11: Proceedings of the 17th ACM SIGKDD international conference on Knowledge discovery and data mining                                                 6
ACM Transactions on Information Systems (TOIS)                                                                                                              6
                                                                                                                                                           ..
SIGMOD '03: Proceedings of the 2003 ACM SIGMOD international conference on Management of data                                                               1
AAAI '98/IAAI '98: Proceedings of the fifteenth national/tenth conference on Artificial intelligence/Innovative applications of artificial intelligence     1
SIGMOD '07: Proceedings of the 2007 ACM SIGMOD international conference on Management of data                                                               1
Numerische Mathematik                                                                                                                                       1
SIGSPACIAL '16: Proceedings of the 24th ACM SIGSPATIAL International Conference on Advances in Geographic Information Systems                               1
Name: count, Length: 129, dtype: int64
In [16]:
# Distribution of the free-text `number and pages` field, NaN included.
data.loc[:, 'number and pages'].value_counts(dropna=False)
Out[16]:
number and pages
NaN                              250
Article No.: 5, Pages 1 - 44       2
Article No.: 40, Pages 1 - 10      1
Article No.: 85, Pages 1 - 4       1
Article No.: 38, Pages 1 - 55      1
Article No.: 23, Pages 1 - 27      1
Article No.: 29, Pages 1 - 41      1
Article No.: 2, Pages 1 - 29       1
Article No.: 11, Pages 1 - 10      1
Name: count, dtype: int64
In [17]:
# Number of rows per DOI; a count above 1 means the same DOI appears in several rows.
data.loc[:, 'doi'].value_counts(dropna=False)
Out[17]:
doi
NaN                                              72
https://doi.org/10.1109/ICDM.2010.152             3
https://doi.org/10.1016/S0305-0548(01)00079-X     3
https://doi.org/10.1145/2339530.2339561           3
https://doi.org/10.1109/5254.708428               3
                                                 ..
https://doi.org/10.1109/34.790435                 1
https://doi.org/10.1109/34.88566                  1
https://doi.org/10.1006/cviu.1999.0801            1
https://doi.org/10.1109/83.855433                 1
https://doi.org/10.1145/2030112.2030126           1
Name: count, Length: 161, dtype: int64
In [18]:
# Number of rows per exact publication date; missing dates show up as NaT.
data.loc[:, 'published'].value_counts(dropna=False)
Out[18]:
published
NaT           29
1999-02-08     8
1998-07-01     6
2011-08-21     6
2002-05-01     4
              ..
2011-12-01     1
2011-10-24     1
2003-09-09     1
2014-09-18     1
2016-10-31     1
Name: count, Length: 151, dtype: int64
In [19]:
# Collapse every publication date to 1 January of its year, then count rows per year.
# NaT passes through unchanged and is kept as its own bucket (dropna=False).
(
    data['published']
    .map(lambda stamp: stamp.replace(month=1, day=1))
    .value_counts(dropna=False)
)
Out[19]:
published
NaT           29
2011-01-01    29
1998-01-01    22
1999-01-01    20
2010-01-01    15
2009-01-01    14
2012-01-01    14
2008-01-01    10
2007-01-01    10
2002-01-01    10
2006-01-01     9
1997-01-01     9
2013-01-01     8
2000-01-01     8
2003-01-01     7
2014-01-01     6
1996-01-01     5
1992-01-01     5
2005-01-01     4
1994-01-01     3
1989-01-01     3
2015-01-01     2
2004-01-01     2
1991-01-01     2
1990-01-01     2
2001-01-01     2
1962-01-01     1
1980-01-01     1
1984-01-01     1
2016-01-01     1
1985-01-01     1
1995-01-01     1
1983-01-01     1
1959-01-01     1
1987-01-01     1
Name: count, dtype: int64
In [20]:
# Rows per publication year in chronological order (rows without a date are dropped).
# Title and axis labels are set so the figure can be read on its own.
(
    data['published']
    .apply(lambda date: date.replace(day=1, month=1))
    .value_counts(dropna=True)
    .sort_index()
    .plot(
        title='Articles per publication year',
        xlabel='Publication year',
        ylabel='Number of articles',
    )
);
No description has been provided for this image
In [21]:
# Citation counts in ascending order; NaN values are placed at the end.
data.loc[:, 'citation'].sort_values()
Out[21]:
161    1.0
57     3.0
51     3.0
65     3.0
182    4.0
      ... 
228    NaN
229    NaN
242    NaN
252    NaN
258    NaN
Name: citation, Length: 259, dtype: float64
In [22]:
# Histogram of citation counts, labelled so the figure can be read on its own.
# Series.hist returns the Axes, which is then given a title and axis labels.
data['citation'].hist().set(
    title='Distribution of citation counts',
    xlabel='Citations',
    ylabel='Number of articles',
);
No description has been provided for this image
In [23]:
# Ten rows with the highest citation count (descending sort, first ten positions).
data.sort_values('citation', ascending=False).iloc[:10]
Out[23]:
url deapth title authors source number and pages doi published citation metric abstract references
237 https://dl.acm.org/doi/10.1145/2487575.2487616 2 The WEKA data mining software: an update [Mark Hall, Eibe Frank, Geoffrey Holmes, Bernh... ACM SIGKDD Explorations Newsletter NaN https://doi.org/10.1145/1656274.1656278 2009-11-16 13690.0 21242.0 More than twelve years have elapsed since the ... [https://dl.acm.org/doi/10.5555/998688.1007097...
2 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Random Forests [Leo Breiman] Machine Learning NaN https://doi.org/10.1023/A:1010933404324 2001-10-01 9828.0 0.0 Random forests are a combination of tree predi... [https://dl.acm.org/doi/10.1162/neco.1997.9.7....
245 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Latent dirichlet allocation [David M. Blei, Andrew Y. Ng, Michael I. Jordan] The Journal of Machine Learning Research NaN NaN 2003-03-01 7820.0 36452.0 We describe latent Dirichlet allocation (LDA),... []
159 https://dl.acm.org/doi/10.1145/2996913.2996996 2 Latent dirichlet allocation [David M. Blei, Andrew Y. Ng, Michael I. Jordan] The Journal of Machine Learning Research NaN NaN 2003-03-01 7820.0 36452.0 We describe latent Dirichlet allocation (LDA),... []
13 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Scikit-learn: Machine Learning in Python [Fabian Pedregosa, Gaël Varoquaux, Alexandre G... The Journal of Machine Learning Research NaN NaN 2011-11-01 7648.0 19849.0 Scikit-learn is a Python module integrating a ... []
94 https://dl.acm.org/doi/10.1145/2996913.2996996 2 A training algorithm for optimal margin classi... [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... COLT '92: Proceedings of the fifth annual work... NaN https://doi.org/10.1145/130385.130401 1992-07-01 6967.0 16072.0 A training algorithm that maximizes the margin... [https://dl.acm.org/doi/10.1162/neco.1989.1.1....
215 https://dl.acm.org/doi/10.1145/2487575.2487616 2 A training algorithm for optimal margin classi... [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... COLT '92: Proceedings of the fifth annual work... NaN https://doi.org/10.1145/130385.130401 1992-07-01 6967.0 16072.0 A training algorithm that maximizes the margin... [https://dl.acm.org/doi/10.1162/neco.1989.1.1....
197 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Authoritative sources in a hyperlinked environ... [Jon M. Kleinberg] Journal of the ACM (JACM) NaN https://doi.org/10.1145/324133.324140 1999-09-01 5920.0 20279.0 The network structure of a hyperlinked environ... []
244 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Probabilistic topic models [David M. Blei] Communications of the ACM NaN https://doi.org/10.1145/2133806.2133826 2012-04-01 3659.0 142464.0 Surveying a suite of algorithms that offer a s... [https://dl.acm.org/doi/10.5555/1795114.179511...
181 https://dl.acm.org/doi/10.1145/2487575.2487616 1 Cumulated gain-based evaluation of IR techniques [Kalervo Järvelin, Jaana Kekäläinen] ACM Transactions on Information Systems (TOIS) NaN https://doi.org/10.1145/582415.582418 2002-10-01 3225.0 9826.0 Modern large retrieval environments tend to ov... []
In [24]:
# `metric` values in ascending order; NaN values are placed at the end.
data.loc[:, 'metric'].sort_values()
Out[24]:
129    0.0
110    0.0
105    0.0
104    0.0
202    0.0
      ... 
228    NaN
229    NaN
242    NaN
252    NaN
258    NaN
Name: metric, Length: 259, dtype: float64
In [25]:
# Histogram of the `metric` column, labelled so the figure can be read on its own.
# Series.hist returns the Axes, which is then given a title and axis labels.
data['metric'].hist().set(
    title='Distribution of the metric column',
    xlabel='metric',
    ylabel='Number of articles',
);
No description has been provided for this image
In [26]:
# Ten rows with the highest `metric` value (descending sort, first ten positions).
data.sort_values('metric', ascending=False).iloc[:10]
Out[26]:
url deapth title authors source number and pages doi published citation metric abstract references
244 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Probabilistic topic models [David M. Blei] Communications of the ACM NaN https://doi.org/10.1145/2133806.2133826 2012-04-01 3659.0 142464.0 Surveying a suite of algorithms that offer a s... [https://dl.acm.org/doi/10.5555/1795114.179511...
159 https://dl.acm.org/doi/10.1145/2996913.2996996 2 Latent dirichlet allocation [David M. Blei, Andrew Y. Ng, Michael I. Jordan] The Journal of Machine Learning Research NaN NaN 2003-03-01 7820.0 36452.0 We describe latent Dirichlet allocation (LDA),... []
245 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Latent dirichlet allocation [David M. Blei, Andrew Y. Ng, Michael I. Jordan] The Journal of Machine Learning Research NaN NaN 2003-03-01 7820.0 36452.0 We describe latent Dirichlet allocation (LDA),... []
237 https://dl.acm.org/doi/10.1145/2487575.2487616 2 The WEKA data mining software: an update [Mark Hall, Eibe Frank, Geoffrey Holmes, Bernh... ACM SIGKDD Explorations Newsletter NaN https://doi.org/10.1145/1656274.1656278 2009-11-16 13690.0 21242.0 More than twelve years have elapsed since the ... [https://dl.acm.org/doi/10.5555/998688.1007097...
197 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Authoritative sources in a hyperlinked environ... [Jon M. Kleinberg] Journal of the ACM (JACM) NaN https://doi.org/10.1145/324133.324140 1999-09-01 5920.0 20279.0 The network structure of a hyperlinked environ... []
13 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Scikit-learn: Machine Learning in Python [Fabian Pedregosa, Gaël Varoquaux, Alexandre G... The Journal of Machine Learning Research NaN NaN 2011-11-01 7648.0 19849.0 Scikit-learn is a Python module integrating a ... []
94 https://dl.acm.org/doi/10.1145/2996913.2996996 2 A training algorithm for optimal margin classi... [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... COLT '92: Proceedings of the fifth annual work... NaN https://doi.org/10.1145/130385.130401 1992-07-01 6967.0 16072.0 A training algorithm that maximizes the margin... [https://dl.acm.org/doi/10.1162/neco.1989.1.1....
215 https://dl.acm.org/doi/10.1145/2487575.2487616 2 A training algorithm for optimal margin classi... [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... COLT '92: Proceedings of the fifth annual work... NaN https://doi.org/10.1145/130385.130401 1992-07-01 6967.0 16072.0 A training algorithm that maximizes the margin... [https://dl.acm.org/doi/10.1162/neco.1989.1.1....
158 https://dl.acm.org/doi/10.1145/2996913.2996996 2 Trajectory Data Mining: An Overview [Yu Zheng] ACM Transactions on Intelligent Systems and Te... Article No.: 29, Pages 1 - 41 https://doi.org/10.1145/2743025 2015-05-12 1230.0 14146.0 The advances in location-acquisition and mobil... [https://dl.acm.org/doi/10.1109/ICDE.2008.4497...
234 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Algorithm 97: Shortest path [Robert W. Floyd] Communications of the ACM NaN https://doi.org/10.1145/367766.368168 1962-06-01 2828.0 11825.0 NaN []
In [27]:
# How often each abstract text occurs. Besides NaN, the column can also contain the
# placeholder text "No abstract available.", which is counted as a regular value.
data.loc[:, 'abstract'].value_counts(dropna=False)
Out[27]:
abstract
NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      36
No abstract available.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   29
The development of a city gradually fosters different functional regions, such as educational areas and business districts. In this paper, we propose a framework (titled DRoF) that Discovers Regions of different Functions in a city using both human mobility among regions and points of interests (POIs) located in a region. Specifically, we segment a city into disjointed regions according to major roads, such as highways and urban express ways. We infer the functions of each region using a topic-based inference model, which regards a region as a document, a function as a topic, categories of POIs (e.g., restaurants and shopping malls) as metadata (like authors, affiliations, and key words), and human mobility patterns (when people reach/leave a region and where people come from and leave for) as words. As a result, a region is represented by a distribution of functions, and a function is featured by a distribution of mobility patterns. We further identify the intensity of each function in different locations. The results generated by our framework can benefit a variety of applications, including urban planning, location choosing for a business, and social recommendations. We evaluated our method using large-scale and real-world datasets, consisting of two POI datasets of Beijing (in 2010 and 2011) and two 3-month GPS trajectory datasets (representing human mobility) generated by over 12,000 taxicabs in Beijing in 2010 and 2011 respectively. The results justify the advantages of our approach over baseline methods solely using POIs or human mobility.                                                                                                                                                                                                                                      3
Modern large retrieval environments tend to overwhelm their users by their large output. Since all documents are not of equal relevance to their users, highly relevant documents should be identified and ranked first for presentation. In order to develop IR techniques in this direction, it is necessary to develop evaluation approaches and methods that credit IR methods for their ability to retrieve highly relevant documents. This can be done by extending traditional evaluation methods, that is, recall and precision based on binary relevance judgments, to graded relevance judgments. Alternatively, novel measures based on graded relevance judgments may be developed. This article proposes several novel measures that compute the cumulative gain the user obtains by examining the retrieval result up to a given ranked position. The first one accumulates the relevance scores of retrieved documents along the ranked result list. The second one is similar but applies a discount factor to the relevance scores in order to devaluate late-retrieved documents. The third one computes the relative-to-the-ideal performance of IR techniques, based on the cumulative gain they are able to yield. These novel measures are defined and discussed and their use is demonstrated in a case study using TREC data: sample system run results for 20 queries in TREC-7. As a relevance base we used novel graded relevance judgments on a four-point scale. The test results indicate that the proposed measures credit IR methods for their ability to retrieve highly relevant documents and allow testing of statistical significance of effectiveness differences. The graphs based on the measures also provide insight into the performance IR techniques and allow interpretation, for example, from the user point of view.     3
A city offers thousands of social events a day, and it is difficult for dwellers to make choices. The combination of mobile phones and recommender systems can change the way one deals with such abundance. Mobile phones with positioning technology are now widely available, making it easy for people to broadcast their whereabouts, recommender systems can now identify patterns in people’s movements in order to, for example, recommend events. To do so, the system relies on having mobile users who share their attendance at a large number of social events: cold-start users, who have no location history, cannot receive recommendations. We set out to address the mobile cold-start problem by answering the following research question: how can social events be recommended to a cold-start user based only on his home location? To answer this question, we carry out a study of the relationship between preferences for social events and geography, the first of its kind in a large metropolitan area. We sample location estimations of one million mobile phone users in Greater Boston, combine the sample with social events in the same area, and infer the social events attended by 2,519 residents. Upon this data, we test a variety of algorithms for recommending social events. We find that the most effective algorithm recommends events that are popular among residents of an area. The least effective, instead, recommends events that are geographically close to the area. This last result has interesting implications for location-based services that emphasize recommending nearby events.                                                                                                                                                                                                                        3
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         ..
Cartography and other applications of remote sensing have led to an increased interest in the(semi-)automatic interpretation of structures in aerial images of urban and suburban areas. Although these areas are particularly challenging because of their complexity, the degree of regularity in such man-made structures also helps to tackle the problems. The paper presents the iterated application of the Hough transform as a means to exploit such regularities. It shows how such "Cascaded Hough Transform"(or CHT for short) yields straight lines, vanishing points, and vanishing lines. It also illustrates how the latter assist in improving the precision of the former. The examples are based on real aerial photographs.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           1
Pfinder is a real-time system for tracking people and interpreting their behavior. It runs at 10Hz on a standard SGI Indy computer, and has performed reliably on thousands of people in many different physical locations. The system uses a multiclass statistical model of color and shape to obtain a 2D representation of head and hands in a wide range of viewing conditions. Pfinder has been successfully used in a wide range of applications including wireless interfaces, video databases, and low-bandwidth coding.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         1
We present a novel statistical and variational approach to image segmentation based on a new algorithm named region competition. This algorithm is derived by minimizing a generalized Bayes/MDL criterion using the variational principle. The algorithm is guaranteed to converge to a local minimum and combines aspects of snakes/balloons and region growing. Indeed the classic snakes/balloons and region growing algorithms can be directly derived from our approach. We provide theoretical analysis of region competition including accuracy of boundary location, criteria for initial conditions, and the relationship to edge detection using filters. It is straightforward to generalize the algorithm to multiband segmentation and we demonstrate it on gray level images, color images and texture images. The novel color model allows us to eliminate intensity gradients and shadows, thereby obtaining segmentation based on the albedos of objects. It also helps detect highlight regions.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       1
We present a new approach to the modeling and decomposition of Gaussian mixtures by using robust statistical methods. The mixture distribution is viewed as a contaminated Gaussian density. Using this model and the model-fitting (MF) estimator, we propose a recursive algorithm called the Gaussian mixture density decomposition (GMDD) algorithm for successively identifying each Gaussian component in the mixture. The proposed decomposition scheme has advantages that are desirable but lacking in most existing techniques. In the GMDD algorithm the number of components does not need to be specified a priori, the proportion of noisy data in the mixture can be large, the parameter estimation of each component is virtually initial independent, and the variability in the shape and size of the component densities in the mixture is taken into account. Gaussian mixture density modeling and decomposition has been widely applied in a variety of disciplines that require signal or waveform characterization for classification and recognition. We apply the proposed GMDD algorithm to the identification and extraction of clusters, and the estimation of unknown probability densities. Probability density estimation by identifying a decomposition using the GMDD algorithm, that is, a superposition of normal distributions, is successfully applied to automated cell classification. Computer experiments using both real data and simulated data demonstrate the validity and power of the GMDD algorithm for various models and different noise assumptions                                                                                                                                                                                                                                                                  1
Urban computing for city planning is one of the most significant applications in Ubiquitous computing. In this paper we detect flawed urban planning using the GPS trajectories of taxicabs traveling in urban areas. The detected results consist of 1) pairs of regions with salient traffic problems and 2) the linking structure as well as correlation among them. These results can evaluate the effectiveness of the carried out planning, such as a newly built road and subway lines in a city, and remind city planners of a problem that has not been recognized when they conceive future plans. We conduct our method using the trajectories generated by 30,000 taxis from March to May in 2009 and 2010 in Beijing, and evaluate our results with the real urban planning of Beijing.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      1
Name: count, Length: 172, dtype: int64
In [28]:
# Tally how often each distinct 'references' value occurs; dropna=False keeps
# missing values as their own bucket instead of dropping them from the count.
data['references'].value_counts(dropna=False)
Out[28]:
references
[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              122
[https://dl.acm.org/doi/10.1145/2133806.2133826, https://dl.acm.org/doi/10.5555/944919.944937, https://dl.acm.org/doi/10.1145/2020408.2020523, https://dl.acm.org/doi/10.1145/1835804.1835918, https://dl.acm.org/doi/10.1109/34.161346, https://dl.acm.org/doi/10.1145/2020408.2020571, https://dl.acm.org/doi/10.1145/2063212.2063223, https://dl.acm.org/doi/10.1016/0377-0427(87)90125-7, https://dl.acm.org/doi/10.5555/558008, https://dl.acm.org/doi/10.1145/1999320.1999331, https://dl.acm.org/doi/10.1145/1963405.1963443, https://dl.acm.org/doi/10.1145/2020408.2020462, https://dl.acm.org/doi/10.1145/2030112.2030128, https://dl.acm.org/doi/10.1145/2030112.2030126, https://dl.acm.org/doi/10.5555/2124413]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      3
[https://dl.acm.org/doi/10.1145/130385.130401, https://dl.acm.org/doi/10.5555/211359, https://dl.acm.org/doi/10.1162/089976698300017467, https://dl.acm.org/doi/10.5555/299094, https://dl.acm.org/doi/10.5555/302528.302628, https://dl.acm.org/doi/10.5555/889153, https://dl.acm.org/doi/10.1023/A:1009982220290, https://dl.acm.org/doi/10.5555/645326.649721, https://dl.acm.org/doi/10.1145/288627.288651, https://dl.acm.org/doi/10.5555/576628, https://dl.acm.org/doi/10.5555/299094.299105, https://dl.acm.org/doi/10.1016/0167-8655(94)90027-2, https://dl.acm.org/doi/10.5555/929901, https://dl.acm.org/doi/10.5555/888836, https://dl.acm.org/doi/10.5555/1098680, https://dl.acm.org/doi/10.1023/A:1009715923555, https://dl.acm.org/doi/10.5555/299094.299105, https://dl.acm.org/doi/10.5555/299094.299103, https://dl.acm.org/doi/10.5555/299094.299104]                                                                                                                                                                                                                                                                                                                                                                        3
[https://dl.acm.org/doi/10.5555/303568.303903, https://dl.acm.org/doi/10.5555/525960, https://dl.acm.org/doi/10.5555/646256.684894, https://dl.acm.org/doi/10.1145/130385.130401, https://dl.acm.org/doi/10.5555/3091696.3091706, https://dl.acm.org/doi/10.5555/299094.299100, https://dl.acm.org/doi/10.5555/2998981.2999003, https://dl.acm.org/doi/10.5555/39857, https://dl.acm.org/doi/10.1162/neco.1992.4.1.1, https://dl.acm.org/doi/10.1162/089976698300017269, https://dl.acm.org/doi/10.5555/5509, https://dl.acm.org/doi/10.5555/299094.299103, https://dl.acm.org/doi/10.5555/2980, https://dl.acm.org/doi/10.5555/1196925, https://dl.acm.org/doi/10.5555/646257.685538, https://dl.acm.org/doi/10.5555/794189.794466, https://dl.acm.org/doi/10.5555/148286, https://dl.acm.org/doi/10.5555/646256.684746, https://dl.acm.org/doi/10.5555/302528.302764, https://dl.acm.org/doi/10.1162/089976698300017467, https://dl.acm.org/doi/10.1109/78.650102, https://dl.acm.org/doi/10.1145/238061.238070, https://dl.acm.org/doi/10.1016/S0893-6080(98)00032-X, https://dl.acm.org/doi/10.5555/1098680, https://dl.acm.org/doi/10.5555/211359, https://dl.acm.org/doi/10.5555/211359, https://dl.acm.org/doi/10.5555/299094.299099]      2
[https://dl.acm.org/doi/10.1162/neco.1989.1.1.151, https://dl.acm.org/doi/10.5555/109230.109279, https://dl.acm.org/doi/10.1162/neco.1992.4.1.1, https://dl.acm.org/doi/10.5555/118850.118983, https://dl.acm.org/doi/10.5555/1098680]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            2
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               ... 
[https://dl.acm.org/doi/10.1109/34.295913, https://dl.acm.org/doi/10.5555/30394, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767851, https://dl.acm.org/doi/10.1016/1049-9660(91)90028-N, https://dl.acm.org/doi/10.5555/92131, https://dl.acm.org/doi/10.1007/BF00115697, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767596, https://dl.acm.org/doi/10.1109/34.56204, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767505, https://dl.acm.org/doi/10.5555/59551, https://dl.acm.org/doi/10.1007/BF00137441, https://dl.acm.org/doi/10.1137/0731015, https://dl.acm.org/doi/10.1109/34.50626, https://dl.acm.org/doi/10.1007/BF01679685, https://dl.acm.org/doi/10.5555/193183, https://dl.acm.org/doi/10.1007/BF00127812, https://dl.acm.org/doi/10.1016/0005-1098(78)90005-5, https://dl.acm.org/doi/10.5555/534247, https://dl.acm.org/doi/10.1007/BF01427153, https://dl.acm.org/doi/10.1016/0031-3203(89)90010-1, https://dl.acm.org/doi/10.5555/889385, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767599, https://dl.acm.org/doi/10.5555/247372.247375, https://dl.acm.org/doi/10.5555/794190.794527]                                                                                                                                         1
[https://dl.acm.org/doi/10.1109/34.387503, https://dl.acm.org/doi/10.1109/34.334396, https://dl.acm.org/doi/10.1109/34.391395, https://dl.acm.org/doi/10.1007/s005300050046, https://dl.acm.org/doi/10.1109/34.216727, https://dl.acm.org/doi/10.1109/34.85661, https://dl.acm.org/doi/10.5555/200241.200246, https://dl.acm.org/doi/10.1109/34.531801]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           1
[https://dl.acm.org/doi/10.1109/TPAMI.1986.4767747, https://dl.acm.org/doi/10.1109/TPAMI.1987.4767980, https://dl.acm.org/doi/10.5555/30394, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767851, https://dl.acm.org/doi/10.5555/59861, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767596, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767475, https://dl.acm.org/doi/10.5555/6519, https://dl.acm.org/doi/10.5555/1095712, https://dl.acm.org/doi/10.1016/S0734-189X(87)80181-0, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767841, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767852, https://dl.acm.org/doi/10.1109/2.74, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767748]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      1
[https://dl.acm.org/doi/10.1109/TPAMI.1986.4767749, https://dl.acm.org/doi/10.5555/30394, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767851, https://dl.acm.org/doi/10.1109/34.6782, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767596, https://dl.acm.org/doi/10.1016/S0734-189X(87)80153-6, https://dl.acm.org/doi/10.5555/1623516.1623607, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767748]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  1
[https://dl.acm.org/doi/10.5555/645484.656550, https://dl.acm.org/doi/10.1145/1864349.1864380, https://dl.acm.org/doi/10.1007/s00779-005-0046-3, https://dl.acm.org/doi/10.1109/CSE.2009.91, https://dl.acm.org/doi/10.1145/1835804.1835918, https://dl.acm.org/doi/10.1109/MPRV.2007.57, https://dl.acm.org/doi/10.1145/1463434.1463477, https://dl.acm.org/doi/10.1145/321556.321570, https://dl.acm.org/doi/10.1109/MC.2006.308, https://dl.acm.org/doi/10.1145/1869790.1869807, https://dl.acm.org/doi/10.1145/1409635.1409678, https://dl.acm.org/doi/10.1145/1921591.1921596]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               1
Name: count, Length: 122, dtype: int64

Очистка данных¶

In [29]:
# Keep only rows that have a title and renumber the index 0..n-1.
# Filtering and re-indexing happen in one assignment (no in-place mutation),
# so re-running the cell is safe.
data = data[data['title'].notna()].reset_index(drop=True)
In [30]:
# Keep only rows whose 'authors' value has a non-zero length, then renumber the index.
# NOTE(review): relies on every 'authors' value supporting len() (the displayed
# frame shows lists) — confirm no NaN reaches this cell.
data = data[data['authors'].apply(len) > 0].reset_index(drop=True)
In [31]:
# Keep only rows that name a publication source and renumber the index.
# Done as a single chained assignment rather than an in-place reset.
data = data[data['source'].notna()].reset_index(drop=True)
In [32]:
# Keep only rows with a known publication date and renumber the index.
# Done as a single chained assignment rather than an in-place reset.
data = data[data['published'].notna()].reset_index(drop=True)
In [33]:
# Keep only rows with a usable abstract: the value must be present and must not
# be the literal 'No abstract available.' placeholder. The index is renumbered
# in the same assignment instead of by a separate in-place reset.
data = data[
    data['abstract'].notna()
    & (data['abstract'] != 'No abstract available.')
].reset_index(drop=True)
In [34]:
# Collapse rows that describe the same article, keeping the first occurrence.
# The duplicate key deliberately leaves out:
#   * 'url' and 'deapth' — presumably crawl bookkeeping rather than article
#     content (TODO confirm against the parser);
#   * 'authors' and 'references' — NOTE(review): these look like list-valued
#     columns, which is presumably why they were excluded from the key; confirm.
# ignore_index=True renumbers the result 0..n-1, replacing a separate in-place
# reset_index call.
data = data.drop_duplicates(
    subset=[
        'title',
        'source',
        'number and pages',
        'doi',
        'published',
        'citation',
        'metric',
        'abstract',
    ],
    ignore_index=True,
)
In [35]:
# Display the frame as it stands after the filtering and de-duplication cells.
data
Out[35]:
url deapth title authors source number and pages doi published citation metric abstract references
0 https://dl.acm.org/doi/10.1145/2996913.2996996 0 Demand driven store site selection via multipl... [Mengwen Xu, Tianyi Wang, Zhengwei Wu, Jingbo ... SIGSPACIAL '16: Proceedings of the 24th ACM SI... Article No.: 40, Pages 1 - 10 https://doi.org/10.1145/2996913.2996996 2016-10-31 26.0 617.0 Choosing a good location when opening a new st... [https://dl.acm.org/doi/10.1016/S0305-0548(01)...
1 https://dl.acm.org/doi/10.1145/2996913.2996996 1 The generalized maximal covering location problem [Oded Berman, Dmitry Krass] Computers and Operations Research NaN https://doi.org/10.1016/S0305-0548(01)00079-X 2002-05-01 34.0 0.0 We consider a generalization of the maximal co... []
2 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Random Forests [Leo Breiman] Machine Learning NaN https://doi.org/10.1023/A:1010933404324 2001-10-01 9828.0 0.0 Random forests are a combination of tree predi... [https://dl.acm.org/doi/10.1162/neco.1997.9.7....
3 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Efficient algorithms for optimal location quer... [Zitong Chen, Yubao Liu, Raymond Chi-Wing Wong... SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... NaN https://doi.org/10.1145/2588555.2612172 2014-06-18 47.0 790.0 In this paper, we study the optimal location q... [https://dl.acm.org/doi/10.14778/2350229.23502...
4 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Mean Shift: A Robust Approach Toward Feature S... [Dorin Comaniciu, Peter Meer] IEEE Transactions on Pattern Analysis and Mach... NaN https://doi.org/10.1109/34.1000236 2002-05-01 2062.0 0.0 A general nonparametric technique is proposed ... [https://dl.acm.org/doi/10.1007/BF00128233, ht...
... ... ... ... ... ... ... ... ... ... ... ... ...
164 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Machine learning approaches for high-resolutio... [Ranga Raju Vatsavai, Eddie Bright, Chandola V... COM.Geo '11: Proceedings of the 2nd Internatio... Article No.: 11, Pages 1 - 10 https://doi.org/10.1145/1999320.1999331 2011-05-23 18.0 526.0 The proliferation of several machine learning ... [https://dl.acm.org/doi/10.5555/1191551.119179...
165 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Geographical topic discovery and comparison [Zhijun Yin, Liangliang Cao, Jiawei Han, Cheng... WWW '11: Proceedings of the 20th international... NaN https://doi.org/10.1145/1963405.1963443 2011-03-28 232.0 1642.0 This paper studies the problem of discovering ... [https://dl.acm.org/doi/10.5555/944919.944937,...
166 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Driving with knowledge from the physical world [Jing Yuan, Yu Zheng, Xing Xie, Guangzhong Sun] KDD '11: Proceedings of the 17th ACM SIGKDD in... NaN https://doi.org/10.1145/2020408.2020462 2011-08-21 641.0 2908.0 This paper presents a Cloud-based system compu... [https://dl.acm.org/doi/10.1016/j.eswa.2008.07...
167 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Where to find my next passenger [Jing Yuan, Yu Zheng, Liuhang Zhang, XIng Xie,... UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030128 2011-09-17 276.0 2024.0 We present a recommender for taxi drivers and ... [https://dl.acm.org/doi/10.1145/304182.304187,...
168 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Urban computing with taxicabs [Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie] UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030126 2011-09-17 413.0 3122.0 Urban computing for city planning is one of th... [https://dl.acm.org/doi/10.5555/645484.656550,...

169 rows × 12 columns

In [36]:
# Print per-column dtypes and non-null counts to check what the cleaning left behind.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   url               169 non-null    object        
 1   deapth            169 non-null    int64         
 2   title             169 non-null    object        
 3   authors           169 non-null    object        
 4   source            169 non-null    object        
 5   number and pages  8 non-null      object        
 6   doi               142 non-null    object        
 7   published         169 non-null    datetime64[ns]
 8   citation          169 non-null    float64       
 9   metric            169 non-null    float64       
 10  abstract          169 non-null    object        
 11  references        169 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(8)
memory usage: 16.0+ KB

Предобработка аннотаций¶

Лемматизация — приведение словоформы к лемме, то есть к её нормальной (словарной) форме. Например, существительные приводятся к форме единственного числа именительного падежа.

Стемминг — нахождение основы слова для заданного исходного слова. Например: выделение корня слова.

In [37]:
# Load spaCy's small English pipeline once; process_text() reuses it on every call.
# The model package must already be installed
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')
In [38]:
def process_text(text: str) -> str:
    """Normalize a raw text into a space-separated string of lemmas.

    The text is lower-cased, the ASCII digits 0-9 are deleted, and the result
    is run through the module-level spaCy pipeline ``nlp``. Tokens flagged as
    punctuation or as stop words are dropped; the lemmas of the remaining
    tokens are joined with single spaces.
    """
    # One translate pass deletes all ten ASCII digits.
    cleaned = text.lower().translate(str.maketrans('', '', '0123456789'))

    lemmas = [
        token.lemma_
        for token in nlp(cleaned)
        if not (token.is_punct or token.is_stop)
    ]

    return ' '.join(lemmas)
In [39]:
data['abstract'][0]
Out[39]:
'Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi.'
In [40]:
process_text(data['abstract'][0])
Out[40]:
'choose good location open new store crucial future success business traditional method include offline manual survey analytic model base census datum unable adapt dynamic market time consume rapid increase availability big datum type mobile device online query datum offline positioning datum provide possibility develop automatic accurate data- drive prediction model business store site selection paper propose demand drive store site selection dd framework business store site selection mining search query datum baidu map dd detect spatial temporal distribution customer demand different business service query datum baidu map large online map search engine china detect gap demand supply determine candidate location cluster gap final stage solve location optimization problem predict rank number customer deploy supervised regression model predict number customer use learn rank model directly rank location evaluate framework type business real world case experiment result demonstrate effectiveness method dd core function store site selection implement core component business analytic platform potentially chain store merchant baidu nuomi'
In [41]:
# Lemmatize every abstract; process_text is passed directly, no wrapper needed.
data['process_abstract'] = data['abstract'].apply(process_text)
In [42]:
data
Out[42]:
url deapth title authors source number and pages doi published citation metric abstract references process_abstract
0 https://dl.acm.org/doi/10.1145/2996913.2996996 0 Demand driven store site selection via multipl... [Mengwen Xu, Tianyi Wang, Zhengwei Wu, Jingbo ... SIGSPACIAL '16: Proceedings of the 24th ACM SI... Article No.: 40, Pages 1 - 10 https://doi.org/10.1145/2996913.2996996 2016-10-31 26.0 617.0 Choosing a good location when opening a new st... [https://dl.acm.org/doi/10.1016/S0305-0548(01)... choose good location open new store crucial fu...
1 https://dl.acm.org/doi/10.1145/2996913.2996996 1 The generalized maximal covering location problem [Oded Berman, Dmitry Krass] Computers and Operations Research NaN https://doi.org/10.1016/S0305-0548(01)00079-X 2002-05-01 34.0 0.0 We consider a generalization of the maximal co... [] consider generalization maximal cover location...
2 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Random Forests [Leo Breiman] Machine Learning NaN https://doi.org/10.1023/A:1010933404324 2001-10-01 9828.0 0.0 Random forests are a combination of tree predi... [https://dl.acm.org/doi/10.1162/neco.1997.9.7.... random forest combination tree predictor tree ...
3 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Efficient algorithms for optimal location quer... [Zitong Chen, Yubao Liu, Raymond Chi-Wing Wong... SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... NaN https://doi.org/10.1145/2588555.2612172 2014-06-18 47.0 790.0 In this paper, we study the optimal location q... [https://dl.acm.org/doi/10.14778/2350229.23502... paper study optimal location query problem bas...
4 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Mean Shift: A Robust Approach Toward Feature S... [Dorin Comaniciu, Peter Meer] IEEE Transactions on Pattern Analysis and Mach... NaN https://doi.org/10.1109/34.1000236 2002-05-01 2062.0 0.0 A general nonparametric technique is proposed ... [https://dl.acm.org/doi/10.1007/BF00128233, ht... general nonparametric technique propose analys...
... ... ... ... ... ... ... ... ... ... ... ... ... ...
164 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Machine learning approaches for high-resolutio... [Ranga Raju Vatsavai, Eddie Bright, Chandola V... COM.Geo '11: Proceedings of the 2nd Internatio... Article No.: 11, Pages 1 - 10 https://doi.org/10.1145/1999320.1999331 2011-05-23 18.0 526.0 The proliferation of several machine learning ... [https://dl.acm.org/doi/10.5555/1191551.119179... proliferation machine learning approach make d...
165 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Geographical topic discovery and comparison [Zhijun Yin, Liangliang Cao, Jiawei Han, Cheng... WWW '11: Proceedings of the 20th international... NaN https://doi.org/10.1145/1963405.1963443 2011-03-28 232.0 1642.0 This paper studies the problem of discovering ... [https://dl.acm.org/doi/10.5555/944919.944937,... paper study problem discover compare geographi...
166 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Driving with knowledge from the physical world [Jing Yuan, Yu Zheng, Xing Xie, Guangzhong Sun] KDD '11: Proceedings of the 17th ACM SIGKDD in... NaN https://doi.org/10.1145/2020408.2020462 2011-08-21 641.0 2908.0 This paper presents a Cloud-based system compu... [https://dl.acm.org/doi/10.1016/j.eswa.2008.07... paper present cloud base system computing cust...
167 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Where to find my next passenger [Jing Yuan, Yu Zheng, Liuhang Zhang, XIng Xie,... UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030128 2011-09-17 276.0 2024.0 We present a recommender for taxi drivers and ... [https://dl.acm.org/doi/10.1145/304182.304187,... present recommender taxi driver people expect ...
168 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Urban computing with taxicabs [Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie] UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030126 2011-09-17 413.0 3122.0 Urban computing for city planning is one of th... [https://dl.acm.org/doi/10.5555/645484.656550,... urban computing city planning significant appl...

169 rows × 13 columns

Векторизация¶

Кодирование по наличию слова в тексте (Flag):

Тексты разбиваются на слова. Далее по всем текстам составляется словарь уникальных слов, и каждому тексту сопоставляется вектор по этому словарю. Если какое-то слово из словаря встречается в определённом тексте, то ему проставляется 1, в противном случае 0.

Плюсы:

  • Простота использования.
  • Скорость генерации вектора.

Минусы:

  • Не учитывает порядок слов.
  • Не учитывает частотность слов.
  • Не учитывает совстречаемость слов.
  • Редкие слова могут быть вытеснены словами, которые очень часто встречаются в любых текстах.

Таким образом, векторы для каждого из текстов представляют собой наборы 0 и 1.

Мешок слов (Bag of Words):

В этом случае текст представляется в виде «мешка» из разных слов. Порядок этих слов игнорируется — важна только частота, с которой они встречаются. Для каждого текста создается вектор, где каждый элемент описывает количество вхождений определенного слова из словаря.

Плюсы:

  • Простота использования.
  • Скорость генерации вектора.

Минусы:

  • Не учитывает порядок слов.
  • Не учитывает совстречаемость слов.
  • Редкие слова могут быть вытеснены словами, которые очень часто встречаются в любых текстах.

Таким образом, векторы для каждого из текстов представляют собой словари с указанием количества упоминаний слова в нём.

Term frequency - Inverse document frequency (TF-IDF):

Это числовой статистический показатель, который отражает важность слова для документа. Формально TF-IDF определяется так:

$$tf = \frac{\text{Частотность слова в документе}}{\text{Общее количество слов в документе}}$$

$$idf = \log \frac{\text{Общее количество документов}}{\text{Количество документов со словом}}$$

$$\text{tf-idf} = \text{tf} \times \text{idf}$$

Плюсы:

  • Простота использования.
  • Скорость генерации вектора.
  • По умолчанию определен на уровне документов.
  • Учитывает относительную встречаемость слова.
  • Учитывает безусловную частотность/редкость слова.

Минусы:

  • Не учитывает порядок слов.
  • Не учитывает совстречаемость слов.

Таким образом, показатель TF-IDF учитывает значимость слова — чем выше показатель, тем важнее слово.

Базовые подходы¶

In [43]:
def vectorization(texts: Union[list[str], pd.Series, pd.DataFrame], vectorizer_name: str) -> pd.DataFrame:
    """Fit a scikit-learn text vectorizer on ``texts`` and return the result as a DataFrame.

    Args:
        texts: Documents to vectorize; passed unchanged to ``fit_transform``.
        vectorizer_name: ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
            The chosen vectorizer is created with its default settings.

    Returns:
        A dense DataFrame holding the fitted matrix, with columns named after
        the vectorizer's ``get_feature_names_out()``.

    Raises:
        ValueError: If ``vectorizer_name`` is not one of the supported names.
    """
    if vectorizer_name == 'CountVectorizer':
        vectorizer = CountVectorizer()
    elif vectorizer_name == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError(f'Неизвестный векторизатор: {vectorizer_name}!')

    vectors = vectorizer.fit_transform(texts)

    # toarray() densifies the matrix so it can be shown and indexed as a DataFrame.
    vectors = pd.DataFrame(data=vectors.toarray(), columns=vectorizer.get_feature_names_out())

    return vectors
In [44]:
count_vectorization = vectorization(
    texts=data['process_abstract'], 
    vectorizer_name='CountVectorizer',
)
In [45]:
flag_vectorization = count_vectorization.mask(count_vectorization > 1, 1)
In [46]:
tfidf_vectorization = vectorization(
    texts=data['process_abstract'], 
    vectorizer_name='TfidfVectorizer',
)
In [47]:
count_vectorization
Out[47]:
ability able abnormal absence absorb abundance abundant academia academic accept ... workload workstation world write xor year yield york zero zone
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
164 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
165 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
166 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
167 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
168 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

169 rows × 2592 columns

In [48]:
flag_vectorization
Out[48]:
ability able abnormal absence absorb abundance abundant academia academic accept ... workload workstation world write xor year yield york zero zone
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
164 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
165 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
166 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
167 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
168 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

169 rows × 2592 columns

In [49]:
tfidf_vectorization
Out[49]:
ability able abnormal absence absorb abundance abundant academia academic accept ... workload workstation world write xor year yield york zero zone
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.041268 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.072373 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.136196 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
164 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
165 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
166 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.084571 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
167 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0
168 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0

169 rows × 2592 columns

In [50]:
count_vectorization.loc[0].sort_values(ascending=False)
Out[50]:
store        6
business     6
datum        6
selection    4
site         4
            ..
forefront    0
forest       0
forgame      0
forge        0
zone         0
Name: 0, Length: 2592, dtype: int64
In [51]:
flag_vectorization.loc[0].sort_values(ascending=False)
Out[51]:
positioning    1
datum          1
rank           1
determine      1
method         1
              ..
forefront      0
forest         0
forgame        0
forge          0
zone           0
Name: 0, Length: 2592, dtype: int64
In [52]:
tfidf_vectorization.loc[0].sort_values(ascending=False)
Out[52]:
business     0.318310
store        0.311331
site         0.243536
baidu        0.237264
dd           0.237264
               ...   
forefront    0.000000
forest       0.000000
forgame      0.000000
forge        0.000000
zone         0.000000
Name: 0, Length: 2592, dtype: float64

Нейросетевые алгоритмы¶

Эмбеддинги — самая популярная технология в NLP и не только. Эмбеддинг — это представление слова/текста/картинки и т.д. в виде вектора низкой размерности. Если объекты для которых получены эмбеддинги близки по смыслу, их векторы будут также похожи. Чтобы добиться такого результата, эмбеддинги обучают на больших массивах текстов с использованием нейронных сетей. У этого подхода много преимуществ по сравнению с другими:

Плюсы:

  • Эмбеддинги способны улавливать семантические отношения между словами.
  • Низкая размерность векторов.
  • Возможность преобразования эмбеддингов с помощью векторных операций и получение осознанных и логичных результатов.

Минусы:

  • Сложность реализации.
  • Необходимость больших вычислительных мощностей.
  • Сильная зависимость от качества и количества обучающих данных.

В качестве основных моделей и подходов для генерации эмбеддингов можно выделить следующие:

  • Word2Vec. Векторное представление основывается на контекстной близости: слова, встречающиеся в тексте рядом с одинаковыми словами (а следовательно, имеющие схожий смысл), будут иметь близкие (по косинусному расстоянию) векторы.
  • GloVe (Global Vectors for Word Representation). Этот метод основан на матрице совместной встречаемости слов в корпусе текстов. GloVe пытается объединить преимущества двух подходов к моделированию слов: матричной факторизации и предсказательных моделей вроде word2vec, стремясь к получению более точных векторных представлений.
  • fastText. Разработанный в Facebook Research, fastText улучшает word2vec за счет обработки целых слов и подсловных единиц (например, n-грамм). Это позволяет fastText генерировать вектора для слов, отсутствующих в обучающем наборе, что является значительным преимуществом для языков с богатой морфологией.
  • ELMo (Embeddings from Language Models). ELMo использует модели на основе двунаправленных LSTM (Long Short-Term Memory) сетей для генерации контекстно-зависимых векторных представлений слов. Эти представления богаты на семантическую и синтаксическую информацию, благодаря чему модель более эффективно работает с полисемией и другими языковыми нюансами.
  • BERT (Bidirectional Encoder Representations from Transformers). BERT представляет собой революцию в NLP благодаря своей способности обрабатывать слова в контексте всего предложения с обеих сторон (слева направо и справа налево одновременно). Это достигается за счет использования архитектуры Transformer, что позволяет модели лучше понимать контекст и нюансы языка.
  • GPT (Generative Pre-trained Transformer). Серия моделей GPT начинается с предобученного на большом корпусе текстов Transformer, который затем может быть дообучен на конкретной задаче NLP. Благодаря мощности и гибкости, модели GPT показывают выдающиеся результаты во многих задачах, включая генерацию текста, перевод, ответы на вопросы и многие другие.

Подробнее остановимся на самом первом алгоритме для генерации эмбеддингов - Word2Vec.

Word2Vec¶

Word2Vec — это популярная модель обучения вложений слов, предложенная исследователями компании Google в 2013 году. Она позволяет преобразовать слова из корпуса текстов в векторы чисел таким образом, что слова с похожими семантическими значениями имеют близкие векторные представления в многомерном пространстве.

Уже обученная модель на новостях от Google c длиной эмбеддингов 300.

In [53]:
gensim_word2vec = downloader.load('word2vec-google-news-300') 
In [54]:
len(gensim_word2vec.index_to_key)
Out[54]:
3000000
In [55]:
gensim_word2vec.word_vec('king')
Out[55]:
array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  2.01416016e-02, -1.63085938e-01,  1.35803223e-03,
       -1.44531250e-01, -5.68847656e-02,  4.29687500e-02, -2.46582031e-02,
        1.85546875e-01,  4.47265625e-01,  9.58251953e-03,  1.31835938e-01,
        9.86328125e-02, -1.85546875e-01, -1.00097656e-01, -1.33789062e-01,
       -1.25000000e-01,  2.83203125e-01,  1.23046875e-01,  5.32226562e-02,
       -1.77734375e-01,  8.59375000e-02, -2.18505859e-02,  2.05078125e-02,
       -1.39648438e-01,  2.51464844e-02,  1.38671875e-01, -1.05468750e-01,
        1.38671875e-01,  8.88671875e-02, -7.51953125e-02, -2.13623047e-02,
        1.72851562e-01,  4.63867188e-02, -2.65625000e-01,  8.91113281e-03,
        1.49414062e-01,  3.78417969e-02,  2.38281250e-01, -1.24511719e-01,
       -2.17773438e-01, -1.81640625e-01,  2.97851562e-02,  5.71289062e-02,
       -2.89306641e-02,  1.24511719e-02,  9.66796875e-02, -2.31445312e-01,
        5.81054688e-02,  6.68945312e-02,  7.08007812e-02, -3.08593750e-01,
       -2.14843750e-01,  1.45507812e-01, -4.27734375e-01, -9.39941406e-03,
        1.54296875e-01, -7.66601562e-02,  2.89062500e-01,  2.77343750e-01,
       -4.86373901e-04, -1.36718750e-01,  3.24218750e-01, -2.46093750e-01,
       -3.03649902e-03, -2.11914062e-01,  1.25000000e-01,  2.69531250e-01,
        2.04101562e-01,  8.25195312e-02, -2.01171875e-01, -1.60156250e-01,
       -3.78417969e-02, -1.20117188e-01,  1.15234375e-01, -4.10156250e-02,
       -3.95507812e-02, -8.98437500e-02,  6.34765625e-03,  2.03125000e-01,
        1.86523438e-01,  2.73437500e-01,  6.29882812e-02,  1.41601562e-01,
       -9.81445312e-02,  1.38671875e-01,  1.82617188e-01,  1.73828125e-01,
        1.73828125e-01, -2.37304688e-01,  1.78710938e-01,  6.34765625e-02,
        2.36328125e-01, -2.08984375e-01,  8.74023438e-02, -1.66015625e-01,
       -7.91015625e-02,  2.43164062e-01, -8.88671875e-02,  1.26953125e-01,
       -2.16796875e-01, -1.73828125e-01, -3.59375000e-01, -8.25195312e-02,
       -6.49414062e-02,  5.07812500e-02,  1.35742188e-01, -7.47070312e-02,
       -1.64062500e-01,  1.15356445e-02,  4.45312500e-01, -2.15820312e-01,
       -1.11328125e-01, -1.92382812e-01,  1.70898438e-01, -1.25000000e-01,
        2.65502930e-03,  1.92382812e-01, -1.74804688e-01,  1.39648438e-01,
        2.92968750e-01,  1.13281250e-01,  5.95703125e-02, -6.39648438e-02,
        9.96093750e-02, -2.72216797e-02,  1.96533203e-02,  4.27246094e-02,
       -2.46093750e-01,  6.39648438e-02, -2.25585938e-01, -1.68945312e-01,
        2.89916992e-03,  8.20312500e-02,  3.41796875e-01,  4.32128906e-02,
        1.32812500e-01,  1.42578125e-01,  7.61718750e-02,  5.98144531e-02,
       -1.19140625e-01,  2.74658203e-03, -6.29882812e-02, -2.72216797e-02,
       -4.82177734e-03, -8.20312500e-02, -2.49023438e-02, -4.00390625e-01,
       -1.06933594e-01,  4.24804688e-02,  7.76367188e-02, -1.16699219e-01,
        7.37304688e-02, -9.22851562e-02,  1.07910156e-01,  1.58203125e-01,
        4.24804688e-02,  1.26953125e-01,  3.61328125e-02,  2.67578125e-01,
       -1.01074219e-01, -3.02734375e-01, -5.76171875e-02,  5.05371094e-02,
        5.26428223e-04, -2.07031250e-01, -1.38671875e-01, -8.97216797e-03,
       -2.78320312e-02, -1.41601562e-01,  2.07031250e-01, -1.58203125e-01,
        1.27929688e-01,  1.49414062e-01, -2.24609375e-02, -8.44726562e-02,
        1.22558594e-01,  2.15820312e-01, -2.13867188e-01, -3.12500000e-01,
       -3.73046875e-01,  4.08935547e-03,  1.07421875e-01,  1.06933594e-01,
        7.32421875e-02,  8.97216797e-03, -3.88183594e-02, -1.29882812e-01,
        1.49414062e-01, -2.14843750e-01, -1.83868408e-03,  9.91210938e-02,
        1.57226562e-01, -1.14257812e-01, -2.05078125e-01,  9.91210938e-02,
        3.69140625e-01, -1.97265625e-01,  3.54003906e-02,  1.09375000e-01,
        1.31835938e-01,  1.66992188e-01,  2.35351562e-01,  1.04980469e-01,
       -4.96093750e-01, -1.64062500e-01, -1.56250000e-01, -5.22460938e-02,
        1.03027344e-01,  2.43164062e-01, -1.88476562e-01,  5.07812500e-02,
       -9.37500000e-02, -6.68945312e-02,  2.27050781e-02,  7.61718750e-02,
        2.89062500e-01,  3.10546875e-01, -5.37109375e-02,  2.28515625e-01,
        2.51464844e-02,  6.78710938e-02, -1.21093750e-01, -2.15820312e-01,
       -2.73437500e-01, -3.07617188e-02, -3.37890625e-01,  1.53320312e-01,
        2.33398438e-01, -2.08007812e-01,  3.73046875e-01,  8.20312500e-02,
        2.51953125e-01, -7.61718750e-02, -4.66308594e-02, -2.23388672e-02,
        2.99072266e-02, -5.93261719e-02, -4.66918945e-03, -2.44140625e-01,
       -2.09960938e-01, -2.87109375e-01, -4.54101562e-02, -1.77734375e-01,
       -2.79296875e-01, -8.59375000e-02,  9.13085938e-02,  2.51953125e-01],
      dtype=float32)
In [56]:
gensim_word2vec.word_vec('king').shape
Out[56]:
(300,)

Визуализация эмбеддингов для 3 слов: king, man, woman. На рисунках ниже видно, что линии из квадратов для мужчины и женщины сильнее похожи друг на друга, чем линии для мужчины и короля, что демонстрирует смысловую разницу этих понятий.

In [57]:
# Compare the embeddings of three words: the vector components are cut into
# consecutive chunks of `step` values and every chunk is drawn as its own strip.
compared_words = ['king', 'man', 'woman']

# Look the vectors up once instead of on every loop iteration.
compared_embeddings = np.array([gensim_word2vec.get_vector(word) for word in compared_words])

step = 50

# One strip per chunk (6 strips for 300-dimensional vectors).
fig, axes = plt.subplots(figsize=(15, 7.5), nrows=compared_embeddings.shape[1] // step)

for idx, ax in enumerate(axes):

    sns.heatmap(
        data=compared_embeddings[:, idx*step:(idx+1)*step],
        linewidths=0.1,
        xticklabels=[],
        yticklabels=compared_words,
        cmap='coolwarm',
        cbar=False,
        square=True,
        ax=ax,
    );
No description has been provided for this image
In [58]:
# Pearson correlation between the embedding components of the three words.
local = pd.DataFrame(
    data={word: gensim_word2vec.word_vec(word) for word in ('king', 'man', 'woman')},
)

local.corr()
Out[58]:
king man woman
king 1.000000 0.231538 0.129787
man 0.231538 1.000000 0.765997
woman 0.129787 0.765997 1.000000
In [59]:
gensim_word2vec.most_similar('king')
Out[59]:
[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]
In [60]:
gensim_word2vec.similarity('king', 'queen')
Out[60]:
0.6510957
In [61]:
gensim_word2vec.similarity('king', 'man')
Out[61]:
0.22942673
In [62]:
gensim_word2vec.most_similar(
    positive=['king', 'woman'], 
    negative=['man']
)
Out[62]:
[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]

Визуализация отношений между странами и их столицами в формате эмбеддингов пропущенных через метод главных компонент (PCA) для сокращения размерности и возможности отображения на двумерной плоскости.

image.png

In [63]:
gensim_word2vec.similarity('russia', 'moscow')
Out[63]:
0.5842015
In [64]:
gensim_word2vec.similarity('usa', 'moscow')
Out[64]:
0.4821158
In [65]:
gensim_word2vec.most_similar(
    positive=['russia', 'tokyo'], 
    negative=['moscow']
)
Out[65]:
[('japan', 0.5613622665405273),
 ('asia', 0.5138392448425293),
 ('korea', 0.5004267692565918),
 ('washington', 0.4972049295902252),
 ('murdoch', 0.49687281250953674),
 ('korean', 0.4891359508037567),
 ('north_korea', 0.48261064291000366),
 ('japanese', 0.48163819313049316),
 ('obj', 0.4747077524662018),
 ('south_korea', 0.47251230478286743)]

Векторизуем наши аннотации статей с помощью Word2Vec. Основной проблемой здесь является то, что эта модель может преобразовывать только слова в векторы, а нам бы хотелось сделать аналогичную операцию, но с текстами. Для этого воспользуемся некоторым допущением и будем считать, что конкретный текст может быть представлен усредненным набором эмбеддингов слов, из которых он состоит.

In [66]:
# NOTE(review): each word is reduced to the scalar mean of its embedding
# components, so a document row stays a vocabulary-sized vector rather than an
# average of its words' 300-dimensional embeddings — confirm this is intended.
word2vec_vectorization = flag_vectorization.copy()

for column in word2vec_vectorization:

    # key_to_index is a dict, so the membership test is a hash lookup; testing
    # against the index_to_key list scans up to 3,000,000 entries per word.
    if column not in gensim_word2vec.key_to_index:
        word2vec_vectorization[column] = 0
        continue

    embedding = gensim_word2vec.get_vector(column)
    embedding = np.mean(embedding)

    # Replace the presence flag (1) with the word's scalar value; zeros stay zero.
    word2vec_vectorization[column] = word2vec_vectorization[column].mask(word2vec_vectorization[column] == 1, embedding)
In [67]:
word2vec_vectorization
Out[67]:
ability able abnormal absence absorb abundance abundant academia academic accept ... workload workstation world write xor year yield york zero zone
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 -0.013312 0.0 0 0.0 0.000000 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0 0.0 0.000000 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0 0.0 -0.000551 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0 0.0 0.000000 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.005286 ... 0.0 0.0 0.000000 0.0 0 0.0 0.000000 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
164 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0 0.0 0.000000 0.0 0.0 0.0
165 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0 0.0 0.000000 0.0 0.0 0.0
166 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 -0.013312 0.0 0 0.0 0.000000 0.0 0.0 0.0
167 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0 0.0 0.000000 0.0 0.0 0.0
168 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.000000 0.0 0 0.0 0.000000 0.0 0.0 0.0

169 rows × 2592 columns

Поиск похожих статей¶

Интересный факт:

Вспомним определение корреляции Пирсона.

$\rho = \frac{\text{cov}(x, y)}{\sigma (x) \sigma (y)} = \frac{\sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{N} (x_i - \bar{x})^2 \sum_{i=1}^{N} (y_i - \bar{y})^2}}$

Вспомним составляющие его части.

  • $\text{cov}(x, y) = \frac{\sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y})}{N - 1}$
  • $\sigma (x) = \sqrt{\frac{\sum_{i=1}^{N} (x_i - \bar{x})^2}{N - 1}}$

Что-то отдаленно напоминает...

Посмотрим поближе на скалярное произведение векторов.

$\langle x, y \rangle = \sum_{i=1}^{N} x_i y_i = \| x \|_{2} \| y \|_{2} \cos \alpha \to \cos \alpha = \frac{\sum_{i=1}^{N} x_i y_i}{\| x \|_{2} \| y \|_{2}}$

Начинает вырисовываться некоторое сходство. Наконец, вспомним определение нормы в евклидовом пространстве.

$\| x \|_{2} = \sqrt{\sum_{i=1}^{N} x_i^2}$

Таким образом имеем следующее: $\cos \alpha = \frac{\sum_{i=1}^{N} x_i y_i}{\sqrt{\sum_{i=1}^{N} x_i^2 \sum_{i=1}^{N} y_i^2}}$.

Получается, что корреляция Пирсона — это не что иное, как косинус угла между центрированными векторами. Более того, в случае, когда векторы не просто центрированы, но и нормированы, все три метрики — скалярное произведение, косинус угла, корреляция Пирсона — будут эквивалентны.

In [68]:
def get_top_words(vectors: pd.DataFrame, top: int):
    """Print the ``top`` columns of ``vectors`` with the largest column sums.

    Columns are ranked by their sum in descending order and each one is
    printed on its own line as ``<sum> <tab>- <column name>``.
    """
    totals = vectors.sum().sort_values(ascending=False)

    for position, (word, count) in enumerate(zip(totals.index, totals.values)):

        # Stop once ``top`` lines have been printed.
        if position == top:
            break

        print(f'{count} \t- {word}')

Ручной вариант по ключевым словам и косинусному сходству¶

Конкретно в этом случае удобно использовать векторизацию на основе бинаризации наличия слова в тексте (Flag), то есть нам не важно, какое количество раз слово встретилось в тексте, а важен сам факт его наличия. Именно поэтому все частоты слов со значением больше 1 приравниваются к 1, но делается это только на стадии поиска похожих статей. Частоты важны в момент определения набора ключевых слов, по которым в дальнейшем будет осуществляться поиск.

In [69]:
def get_keyword_recommendations(articles: pd.DataFrame, vectors: pd.DataFrame, keywords: list) -> pd.DataFrame:
    """Score articles by cosine similarity between their word flags and a keyword query.

    Args:
        articles: Article table. A ``cosine_similarity`` column is written
            into it in place, so pass a copy to keep the original untouched.
        vectors: Word matrix scored against the query; values above 1 are
            capped at 1 first. Assumed to be row-aligned with ``articles``.
        keywords: Words that form the query. Words that are not columns of
            ``vectors`` are ignored.

    Returns:
        The same ``articles`` object with the ``cosine_similarity`` column added.
    """
    # Accept a single keyword given as a bare string.
    if isinstance(keywords, str):
        keywords = [keywords]

    # Build the 0/1 query row straight from the vocabulary. Assigning through
    # ``.loc`` with a keyword list can add columns for out-of-vocabulary words,
    # after which the query no longer has the same width as ``vectors``.
    initial_vector = pd.DataFrame(
        data=[vectors.columns.isin(keywords).astype(int)],
        columns=vectors.columns,
    )

    vectors = vectors.mask(vectors > 1, 1)
    articles['cosine_similarity'] = cosine_similarity(vectors, initial_vector)

    return articles
In [70]:
get_top_words(count_vectorization, 50)
254 	- location
180 	- base
178 	- user
171 	- algorithm
171 	- datum
149 	- model
132 	- query
132 	- method
129 	- problem
104 	- propose
103 	- result
102 	- paper
98 	- approach
84 	- image
79 	- system
75 	- provide
74 	- network
73 	- new
69 	- set
67 	- application
67 	- recommendation
66 	- technique
65 	- information
65 	- real
64 	- probabilistic
62 	- social
59 	- find
58 	- rank
58 	- present
57 	- study
55 	- service
54 	- performance
52 	- database
50 	- large
49 	- region
48 	- number
47 	- use
45 	- point
44 	- time
43 	- spatial
41 	- analysis
41 	- scale
39 	- function
39 	- mobile
39 	- dataset
38 	- experiment
38 	- optimal
38 	- give
38 	- search
37 	- measure
In [71]:
# Hand-picked query terms for the keyword-based search.
keywords = [
    'location', 'datum', 'model', 'base', 'problem', 'optimal',
    'spatial', 'area', 'search', 'site', 'data', 'place',
    'map', 'business', 'store', 'city', 'selection',
]
In [72]:
# Score all articles against the keyword list, rank them from most to least
# similar and keep only the columns needed for inspection.
result = (
    get_keyword_recommendations(
        articles=data.copy(),
        vectors=flag_vectorization.copy(),
        keywords=keywords,
    )
    .sort_values(by='cosine_similarity', ascending=False)
    .loc[:, ['title', 'abstract', 'published', 'citation', 'metric', 'cosine_similarity']]
)
In [73]:
result
Out[73]:
title abstract published citation metric cosine_similarity
9 Geo-spotting: mining online location-based ser... The problem of identifying the optimal locatio... 2013-08-11 193.0 2151.0 0.318397
0 Demand driven store site selection via multipl... Choosing a good location when opening a new st... 2016-10-31 26.0 617.0 0.312190
21 A scalable algorithm for maximizing range sum ... This paper investigates the MaxRS problem in s... 2012-07-01 51.0 310.0 0.196039
30 Progressive computation of the min-dist optima... This paper proposes and solves the min-dist op... 2006-09-01 36.0 312.0 0.194745
22 The optimal-location query We propose and solve the optimal-location quer... 2005-08-22 38.0 0.0 0.194745
... ... ... ... ... ... ...
39 Adaptive Nonlocal Filtering: A Fast Alternativ... Nonlinear anisotropic diffusion algorithms pro... 1999-01-01 5.0 0.0 0.000000
164 Machine learning approaches for high-resolutio... The proliferation of several machine learning ... 2011-05-23 18.0 526.0 0.000000
51 Cluster-based probability model and its applic... We develop, analyze, and apply a specific form... 1997-02-01 23.0 0.0 0.000000
50 Scale-Space and Edge Detection Using Anisotrop... A new definition of scale-space is suggested, ... 1990-07-01 1696.0 0.0 0.000000
77 An Evaluation of Statistical Approaches to Tex... This paper focuses on a comparative evaluation... 1999-04-01 495.0 0.0 0.000000

169 rows × 6 columns

In [74]:
# Print the title and full abstract of the five top-ranked articles,
# each framed by a 150-character dashed rule.
for idx, row in result.head(5).iterrows():
    print(
        '-----' * 30,
        f"Title: {row['title']}",
        '',
        f"Abstract: {row['abstract']}",
        '-----' * 30,
        sep='\n',
    )
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Geo-spotting: mining online location-based services for optimal retail store placement

Abstract: The problem of identifying the optimal location for a new retail store has been the focus of past research, especially in the field of land economy, due to its importance in the success of a business. Traditional approaches to the problem have factored in demographics, revenue and aggregated human flow statistics from nearby or remote areas. However, the acquisition of relevant data is usually expensive. With the growth of location-based social networks, fine grained data describing user mobility and popularity of places has recently become attainable.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Demand driven store site selection via multiple spatial-temporal data

Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: A scalable algorithm for maximizing range sum in spatial databases

Abstract: This paper investigates the MaxRS problem in spatial databases. Given a set O of weighted points and a rectangular region r of a given size, the goal of the MaxRS problem is to find a location of r such that the sum of the weights of all the points covered by r is maximized. This problem is useful in many location-based applications such as finding the best place for a new franchise store with a limited delivery range and finding the most attractive place for a tourist with a limited reachable range. However, the problem has been studied mainly in theory, particularly, in computational geometry. The existing algorithms from the computational geometry community are in-memory algorithms which do not guarantee the scalability. In this paper, we propose a scalable external-memory algorithm (ExactMaxRS) for the MaxRS problem, which is optimal in terms of the I/O complexity. Furthermore, we propose an approximation algorithm (ApproxMaxCRS) for the MaxCRS problem that is a circle version of the MaxRS problem. We prove the correctness and optimality of the ExactMaxRS algorithm along with the approximation bound of the ApproxMaxCRS algorithm. From extensive experimental results, we show that the ExactMaxRS algorithm is two orders of magnitude faster than methods adapted from existing algorithms, and the approximation bound in practice is much better than the theoretical bound of the ApproxMaxCRS algorithm.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Progressive computation of the min-dist optimal-location query

Abstract: This paper proposes and solves the min-dist optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the min-dist optimal-location query returns a location in Q which, if a new site is built there, minimizes the average distance from each object to its closest site. This query can help a franchise (e.g. McDonald's) decide where to put a new store in order to maximize the benefit to its customers. To solve this problem is challenging, for there are theoretically infinite number of locations in Q, all of which could be candidates. This paper first provides a theorem that limits the number of candidate locations without losing the power to find exact answers. Then it provides a progressive algorithm that quickly suggests a location, tells the maximum error it may have, and keeps refining the result. When the algorithm finishes, the exact answer can be found. The intermediate result of early runs can be used to prune the search space for later runs. Crucial to the pruning technique are novel lower-bound estimators. The proposed algorithm, the effect of several optimizations, and the progressiveness are experimentally evaluated.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: The optimal-location query

Abstract: We propose and solve the optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the optimal-location query returns a location in Q with maximum influence. Here the influence of a location l is the total weight of its RNNs, i.e. the total weight of objects in O that are closer to l than to any site in S. This new query has practical applications, but is very challenging to solve. Existing work on computing RNNs assumes a single query location, and thus cannot be used to compute optimal locations. The reason is that there are infinite candidate locations in Q. If we check a finite set of candidate locations, the result can be inaccurate, i.e. the revealed location may not have maximum influence. This paper proposes three methods that accurately compute optimal locations. The first method uses a standard R*-tree. To compute an optimal location, the method retrieves certain objects from the R*-tree and sends them as a stream to a plane-sweep algorithm, which uses a new data structure called the aSB-tree to ensure query efficiency. The second method is based on a new index structure called the OL-tree, which novelly extends the k-d-B-tree to store segmented rectangular records. The OL-tree is only of theoretical usage for it is not space efficient. The most practical approach is based on a new index structure called the Virtual OL-tree. These methods are theoretically and experimentally evaluated.
------------------------------------------------------------------------------------------------------------------------------------------------------

Автоматизированный вариант по аннотациям и косинусному сходству¶

In [75]:
def get_abstract_recommendations(articles: pd.DataFrame, vectors: pd.DataFrame, initial_vector: pd.DataFrame) -> pd.DataFrame:
    """Rank articles by cosine similarity between their vectors and a reference vector.

    Args:
        articles: One row per article; must have as many rows as ``vectors``
            and contain the columns ``title``, ``abstract``, ``published``,
            ``citation`` and ``metric``.
        vectors: Article-by-feature matrix, one row per article.
        initial_vector: Single-row frame with the same columns as ``vectors``;
            every article is compared against it.

    Returns:
        A new frame with the columns ``title``, ``abstract``, ``published``,
        ``citation``, ``metric`` and ``cosine_similarity``, sorted by
        similarity in descending order. The frame passed in by the caller is
        left unmodified.
    """
    similarity = cosine_similarity(vectors, initial_vector)

    # `assign` builds a new frame, so the caller's `articles` is never mutated.
    ranked = (
        articles
        .assign(cosine_similarity=similarity.ravel())
        .sort_values(by='cosine_similarity', ascending=False)
    )

    report_columns = [
        'title',
        'abstract',
        'published',
        'citation',
        'metric',
        'cosine_similarity',
    ]

    return ranked[report_columns]
In [76]:
# Rank all articles against the article in row 0, using the binary (flag) vectors.
result_flag_vectorization = get_abstract_recommendations(
    articles=data.copy(),
    vectors=flag_vectorization.copy(),
    initial_vector=flag_vectorization.loc[[0]],
)
In [77]:
# Rank all articles against the article in row 0, using the count vectors.
result_count_vectorization = get_abstract_recommendations(
    articles=data.copy(),
    vectors=count_vectorization.copy(),
    initial_vector=count_vectorization.loc[[0]],
)
In [78]:
# Rank all articles against the article in row 0, using the TF-IDF vectors.
result_tfidf_vectorization = get_abstract_recommendations(
    articles=data.copy(),
    vectors=tfidf_vectorization.copy(),
    initial_vector=tfidf_vectorization.loc[[0]],
)
In [79]:
# Rank all articles against the article in row 0, using the word2vec vectors.
result_word2vec_vectorization = get_abstract_recommendations(
    articles=data.copy(),
    vectors=word2vec_vectorization.copy(),
    initial_vector=word2vec_vectorization.loc[[0]],
)
In [80]:
result_flag_vectorization
Out[80]:
title abstract published citation metric cosine_similarity
0 Demand driven store site selection via multipl... Choosing a good location when opening a new st... 2016-10-31 26.0 617.0 1.000000
79 Location-based and preference-aware recommenda... The popularity of location-based social networ... 2012-11-06 503.0 3713.0 0.248784
65 Semi-supervised document retrieval This paper proposes a new machine learning met... 2009-05-01 21.0 0.0 0.241932
6 Exploiting geographic dependencies for real es... It is traditionally a challenge for home buyer... 2014-08-24 81.0 1076.0 0.237429
31 MaxFirst for MaxBRkNN The MaxBRNN problem finds a region such that s... 2011-04-11 26.0 0.0 0.229835
... ... ... ... ... ... ...
152 Improved use of continuous attributes in C4.5 A reported weakness of C4.5 in domains with co... 1996-03-01 260.0 0.0 0.044281
164 Machine learning approaches for high-resolutio... The proliferation of several machine learning ... 2011-05-23 18.0 526.0 0.042875
12 Scikit-learn: Machine Learning in Python Scikit-learn is a Python module integrating a ... 2011-11-01 7648.0 19849.0 0.042008
55 Bilateral Filtering for Gray and Color Images Bilateral filtering smooths images while prese... 1998-01-04 793.0 0.0 0.027730
95 Comparing Top k Lists Motivated by several applications, we introduc... 2004-01-01 218.0 0.0 0.000000

169 rows × 6 columns

In [81]:
result_count_vectorization
Out[81]:
title abstract published citation metric cosine_similarity
0 Demand driven store site selection via multipl... Choosing a good location when opening a new st... 2016-10-31 26.0 617.0 1.000000
159 A taxi business intelligence system The increasing availability of large-scale loc... 2011-08-21 54.0 1288.0 0.348732
7 Optimal network location queries Given a set S of sites and a set O of weighted... 2010-11-02 21.0 199.0 0.313756
30 Progressive computation of the min-dist optima... This paper proposes and solves the min-dist op... 2006-09-01 36.0 312.0 0.305847
65 Semi-supervised document retrieval This paper proposes a new machine learning met... 2009-05-01 21.0 0.0 0.296108
... ... ... ... ... ... ...
51 Cluster-based probability model and its applic... We develop, analyze, and apply a specific form... 1997-02-01 23.0 0.0 0.020885
152 Improved use of continuous attributes in C4.5 A reported weakness of C4.5 in domains with co... 1996-03-01 260.0 0.0 0.019557
164 Machine learning approaches for high-resolutio... The proliferation of several machine learning ... 2011-05-23 18.0 526.0 0.017969
55 Bilateral Filtering for Gray and Color Images Bilateral filtering smooths images while prese... 1998-01-04 793.0 0.0 0.012093
95 Comparing Top k Lists Motivated by several applications, we introduc... 2004-01-01 218.0 0.0 0.000000

169 rows × 6 columns

In [82]:
result_tfidf_vectorization
Out[82]:
title abstract published citation metric cosine_similarity
0 Demand driven store site selection via multipl... Choosing a good location when opening a new st... 2016-10-31 26.0 617.0 1.000000
159 A taxi business intelligence system The increasing availability of large-scale loc... 2011-08-21 54.0 1288.0 0.251345
7 Optimal network location queries Given a set S of sites and a set O of weighted... 2010-11-02 21.0 199.0 0.250829
31 MaxFirst for MaxBRkNN The MaxBRNN problem finds a region such that s... 2011-04-11 26.0 0.0 0.221271
30 Progressive computation of the min-dist optima... This paper proposes and solves the min-dist op... 2006-09-01 36.0 312.0 0.217079
... ... ... ... ... ... ...
40 The estimation of the gradient of a density fu... Nonparametric density gradient estimation usin... 2006-09-01 405.0 0.0 0.011558
155 Getting from here to there: interactive planni... Planning and monitoring a trip is a common but... 2002-07-28 7.0 0.0 0.009749
152 Improved use of continuous attributes in C4.5 A reported weakness of C4.5 in domains with co... 1996-03-01 260.0 0.0 0.006445
55 Bilateral Filtering for Gray and Color Images Bilateral filtering smooths images while prese... 1998-01-04 793.0 0.0 0.003005
95 Comparing Top k Lists Motivated by several applications, we introduc... 2004-01-01 218.0 0.0 0.000000

169 rows × 6 columns

In [83]:
result_word2vec_vectorization
Out[83]:
title abstract published citation metric cosine_similarity
0 Demand driven store site selection via multipl... Choosing a good location when opening a new st... 2016-10-31 26.0 617.0 1.000000
135 Mining significant semantic locations from GPS... With the increasing deployment and use of GPS-... 2010-09-01 254.0 1734.0 0.261367
144 Location-based recommendation system using Bay... As wireless communication advances, research o... 2007-07-11 75.0 0.0 0.251195
7 Optimal network location queries Given a set S of sites and a set O of weighted... 2010-11-02 21.0 199.0 0.241299
6 Exploiting geographic dependencies for real es... It is traditionally a challenge for home buyer... 2014-08-24 81.0 1076.0 0.239075
... ... ... ... ... ... ...
55 Bilateral Filtering for Gray and Color Images Bilateral filtering smooths images while prese... 1998-01-04 793.0 0.0 0.016474
164 Machine learning approaches for high-resolutio... The proliferation of several machine learning ... 2011-05-23 18.0 526.0 0.013971
97 Provenance semirings We show that relational algebra calculations f... 2007-06-11 511.0 2427.0 0.011599
152 Improved use of continuous attributes in C4.5 A reported weakness of C4.5 in domains with co... 1996-03-01 260.0 0.0 0.010914
95 Comparing Top k Lists Motivated by several applications, we introduc... 2004-01-01 218.0 0.0 0.000000

169 rows × 6 columns

In [84]:
# Print the title and full abstract of the five top-ranked articles
# for the flag vectorization, each framed by a 150-character dashed rule.
for idx, row in result_flag_vectorization.head(5).iterrows():
    print(
        '-----' * 30,
        f"Title: {row['title']}",
        '',
        f"Abstract: {row['abstract']}",
        '-----' * 30,
        sep='\n',
    )
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Demand driven store site selection via multiple spatial-temporal data

Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Location-based and preference-aware recommendation using sparse geo-social networking data

Abstract: The popularity of location-based social networks provide us with a new platform to understand users' preferences based on their location histories. In this paper, we present a location-based and preference-aware recommender system that offers a particular user a set of venues (such as restaurants) within a geospatial range with the consideration of both: 1) User preferences, which are automatically learned from her location history and 2) Social opinions, which are mined from the location histories of the local experts. This recommender system can facilitate people's travel not only near their living areas but also to a city that is new to them. As a user can only visit a limited number of locations, the user-locations matrix is very sparse, leading to a big challenge to traditional collaborative filtering-based location recommender systems. The problem becomes even more challenging when people travel to a new city. To this end, we propose a novel location recommender system, which consists of two main parts: offline modeling and online recommendation. The offline modeling part models each individual's personal preferences with a weighted category hierarchy (WCH) and infers the expertise of each user in a city with respect to different category of locations according to their location histories using an iterative learning model. The online recommendation part selects candidate local experts in a geospatial range that matches the user's preferences using a preference-aware candidate selection algorithm and then infers a score of the candidate locations based on the opinions of the selected local experts. Finally, the top-k ranked locations are returned as the recommendations for the user. We evaluated our system with a large-scale real dataset collected from Foursquare. The results confirm that our method offers more effective recommendations than baselines, while having a good efficiency of providing location recommendations.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Semi-supervised document retrieval

Abstract: This paper proposes a new machine learning method for constructing ranking models in document retrieval. The method, which is referred to as SSRank, aims to use the advantages of both the traditional Information Retrieval (IR) methods and the supervised learning methods for IR proposed recently. The advantages include the use of limited amount of labeled data and rich model representation. To do so, the method adopts a semi-supervised learning framework in ranking model construction. Specifically, given a small number of labeled documents with respect to some queries, the method effectively labels the unlabeled documents for the queries. It then uses all the labeled data to train a machine learning model (in our case, Neural Network). In the data labeling, the method also makes use of a traditional IR model (in our case, BM25). A stopping criterion based on machine learning theory is given for the data labeling process. Experimental results on three benchmark datasets and one web search dataset indicate that SSRank consistently and almost always significantly outperforms the baseline methods (unsupervised and supervised learning methods), given the same amount of labeled data. This is because SSRank can effectively leverage the use of unlabeled data in learning.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Exploiting geographic dependencies for real estate appraisal: a mutual perspective of ranking and clustering

Abstract: It is traditionally a challenge for home buyers to understand, compare and contrast the investment values of real estates. While a number of estate appraisal methods have been developed to value real property, the performances of these methods have been limited by the traditional data sources for estate appraisal. However, with the development of new ways of collecting estate-related mobile data, there is a potential to leverage geographic dependencies of estates for enhancing estate appraisal. Indeed, the geographic dependencies of the value of an estate can be from the characteristics of its own neighborhood (individual), the values of its nearby estates (peer), and the prosperity of the affiliated latent business area (zone). To this end, in this paper, we propose a geographic method, named ClusRanking, for estate appraisal by leveraging the mutual enforcement of ranking and clustering power. ClusRanking is able to exploit geographic individual, peer, and zone dependencies in a probabilistic ranking model. Specifically, we first extract the geographic utility of estates from geography data, estimate the neighborhood popularity of estates by mining taxicab trajectory data, and model the influence of latent business areas via ClusRanking. Also, we use a linear model to fuse these three influential factors and predict estate investment values. Moreover, we simultaneously consider individual, peer and zone dependencies, and derive an estate-specific ranking likelihood as the objective function. Finally, we conduct a comprehensive evaluation with real-world estate related data, and the experimental results demonstrate the effectiveness of our method.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: MaxFirst for MaxBRkNN

Abstract: The MaxBRNN problem finds a region such that setting up a new service site within this region would guarantee the maximum number of customers by proximity. This problem assumes that each customer only uses the service provided by his/her nearest service site. However, in reality, a customer tends to go to his/her k nearest service sites. To handle this, MaxBRNN can be extended to the MaxBRkNN problem which finds an optimal region such that setting up a service site in this region guarantees the maximum number of customers who would consider the site as one of their k nearest service locations. We further generalize the MaxBRkNN problem to reflect the real world scenario where customers may have different preferences for different service sites, and at the same time, service sites may have preferred targeted customers. In this paper, we present an efficient solution called MaxFirst to solve this generalized MaxBRkNN problem. The algorithm works by partitioning the space into quadrants and searches only in those quadrants that potentially contain an optimal region. During the space partitioning, we compute the upper and lower bounds of the size of a quadrant's BRkNN, and use these bounds to prune the unpromising quadrants. Experiment results show that MaxFirst can be two to three orders of magnitude faster than the state-of-the-art algorithm.
------------------------------------------------------------------------------------------------------------------------------------------------------
In [85]:
# Print the title and full abstract of the five top-ranked articles
# for the count vectorization, each framed by a 150-character dashed rule.
for idx, row in result_count_vectorization.head(5).iterrows():
    print(
        '-----' * 30,
        f"Title: {row['title']}",
        '',
        f"Abstract: {row['abstract']}",
        '-----' * 30,
        sep='\n',
    )
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Demand driven store site selection via multiple spatial-temporal data

Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: A taxi business intelligence system

Abstract: The increasing availability of large-scale location traces creates unprecedent opportunities to change the paradigm for knowledge discovery in transportation systems. A particularly promising area is to extract useful business intelligence, which can be used as guidance for reducing inefficiencies in energy consumption of transportation sectors, improving customer experiences, and increasing business performances. However, extracting business intelligence from location traces is not a trivial task. Conventional data analytic tools are usually not customized for handling large, complex, dynamic, and distributed nature of location traces. To that end, we develop a taxi business intelligence system to explore the massive taxi location traces from different business perspectives with various data mining functions. Since we implement the system using the real-world taxi GPS data, this demonstration will help taxi companies to improve their business performances by understanding the behaviors of both drivers and customers. In addition, several identified technical challenges also motivate data mining people to develop more sophisticate techniques in the future.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Optimal network location queries

Abstract: Given a set S of sites and a set O of weighted objects, an optimal location query finds the location(s) where introducing a new site maximizes the total weight of the objects that are closer to the new site than to any other site. With such a query, for instance, a franchise corporation (e.g., McDonald's) can find a location to open a new store such that the number of potential store customers (i.e., people living close to the store) is maximized. Optimal location queries are computationally complex to compute and require efficient solutions that scale with large datasets. Previously, two specific approaches have been proposed for efficient computation of optimal location queries. However, they both assume p-norm distance (namely, L1 and L2/Euclidean); hence, they are not applicable where sites and objects are located on spatial networks. In this paper, we focus on optimal network location (ONL) queries, i.e., optimal location queries with which objects and sites reside on a spatial network. We introduce an approach, namely EONL (short for Expansion-based ONL), which enables efficient computation of ONL queries. Moreover, with an extensive experimental study we verify and compare the efficiency of our proposed approach with real datasets, and we demonstrate the importance of considering network distance (rather than p-norm distance) with ONL queries.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Progressive computation of the min-dist optimal-location query

Abstract: This paper proposes and solves the min-dist optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the min-dist optimal-location query returns a location in Q which, if a new site is built there, minimizes the average distance from each object to its closest site. This query can help a franchise (e.g. McDonald's) decide where to put a new store in order to maximize the benefit to its customers. To solve this problem is challenging, for there are theoretically infinite number of locations in Q, all of which could be candidates. This paper first provides a theorem that limits the number of candidate locations without losing the power to find exact answers. Then it provides a progressive algorithm that quickly suggests a location, tells the maximum error it may have, and keeps refining the result. When the algorithm finishes, the exact answer can be found. The intermediate result of early runs can be used to prune the search space for later runs. Crucial to the pruning technique are novel lower-bound estimators. The proposed algorithm, the effect of several optimizations, and the progressiveness are experimentally evaluated.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Semi-supervised document retrieval

Abstract: This paper proposes a new machine learning method for constructing ranking models in document retrieval. The method, which is referred to as SSRank, aims to use the advantages of both the traditional Information Retrieval (IR) methods and the supervised learning methods for IR proposed recently. The advantages include the use of limited amount of labeled data and rich model representation. To do so, the method adopts a semi-supervised learning framework in ranking model construction. Specifically, given a small number of labeled documents with respect to some queries, the method effectively labels the unlabeled documents for the queries. It then uses all the labeled data to train a machine learning model (in our case, Neural Network). In the data labeling, the method also makes use of a traditional IR model (in our case, BM25). A stopping criterion based on machine learning theory is given for the data labeling process. Experimental results on three benchmark datasets and one web search dataset indicate that SSRank consistently and almost always significantly outperforms the baseline methods (unsupervised and supervised learning methods), given the same amount of labeled data. This is because SSRank can effectively leverage the use of unlabeled data in learning.
------------------------------------------------------------------------------------------------------------------------------------------------------
In [86]:
# Print the first five rows of `result_tfidf_vectorization`, one framed card per article.
card_border = '-----' * 30
top_tfidf = result_tfidf_vectorization.head(5)

for article_title, article_abstract in zip(top_tfidf['title'], top_tfidf['abstract']):
    # A single print call; the empty string yields the blank line between title and abstract.
    print(
        card_border,
        f"Title: {article_title}",
        '',
        f"Abstract: {article_abstract}",
        card_border,
        sep='\n',
    )
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Demand driven store site selection via multiple spatial-temporal data

Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: A taxi business intelligence system

Abstract: The increasing availability of large-scale location traces creates unprecedent opportunities to change the paradigm for knowledge discovery in transportation systems. A particularly promising area is to extract useful business intelligence, which can be used as guidance for reducing inefficiencies in energy consumption of transportation sectors, improving customer experiences, and increasing business performances. However, extracting business intelligence from location traces is not a trivial task. Conventional data analytic tools are usually not customized for handling large, complex, dynamic, and distributed nature of location traces. To that end, we develop a taxi business intelligence system to explore the massive taxi location traces from different business perspectives with various data mining functions. Since we implement the system using the real-world taxi GPS data, this demonstration will help taxi companies to improve their business performances by understanding the behaviors of both drivers and customers. In addition, several identified technical challenges also motivate data mining people to develop more sophisticate techniques in the future.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Optimal network location queries

Abstract: Given a set S of sites and a set O of weighted objects, an optimal location query finds the location(s) where introducing a new site maximizes the total weight of the objects that are closer to the new site than to any other site. With such a query, for instance, a franchise corporation (e.g., McDonald's) can find a location to open a new store such that the number of potential store customers (i.e., people living close to the store) is maximized. Optimal location queries are computationally complex to compute and require efficient solutions that scale with large datasets. Previously, two specific approaches have been proposed for efficient computation of optimal location queries. However, they both assume p-norm distance (namely, L1 and L2/Euclidean); hence, they are not applicable where sites and objects are located on spatial networks. In this paper, we focus on optimal network location (ONL) queries, i.e., optimal location queries with which objects and sites reside on a spatial network. We introduce an approach, namely EONL (short for Expansion-based ONL), which enables efficient computation of ONL queries. Moreover, with an extensive experimental study we verify and compare the efficiency of our proposed approach with real datasets, and we demonstrate the importance of considering network distance (rather than p-norm distance) with ONL queries.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: MaxFirst for MaxBRkNN

Abstract: The MaxBRNN problem finds a region such that setting up a new service site within this region would guarantee the maximum number of customers by proximity. This problem assumes that each customer only uses the service provided by his/her nearest service site. However, in reality, a customer tends to go to his/her k nearest service sites. To handle this, MaxBRNN can be extended to the MaxBRkNN problem which finds an optimal region such that setting up a service site in this region guarantees the maximum number of customers who would consider the site as one of their k nearest service locations. We further generalize the MaxBRkNN problem to reflect the real world scenario where customers may have different preferences for different service sites, and at the same time, service sites may have preferred targeted customers. In this paper, we present an efficient solution called MaxFirst to solve this generalized MaxBRkNN problem. The algorithm works by partitioning the space into quadrants and searches only in those quadrants that potentially contain an optimal region. During the space partitioning, we compute the upper and lower bounds of the size of a quadrant's BRkNN, and use these bounds to prune the unpromising quadrants. Experiment results show that MaxFirst can be two to three orders of magnitude faster than the state-of-the-art algorithm.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Progressive computation of the min-dist optimal-location query

Abstract: This paper proposes and solves the min-dist optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the min-dist optimal-location query returns a location in Q which, if a new site is built there, minimizes the average distance from each object to its closest site. This query can help a franchise (e.g. McDonald's) decide where to put a new store in order to maximize the benefit to its customers. To solve this problem is challenging, for there are theoretically infinite number of locations in Q, all of which could be candidates. This paper first provides a theorem that limits the number of candidate locations without losing the power to find exact answers. Then it provides a progressive algorithm that quickly suggests a location, tells the maximum error it may have, and keeps refining the result. When the algorithm finishes, the exact answer can be found. The intermediate result of early runs can be used to prune the search space for later runs. Crucial to the pruning technique are novel lower-bound estimators. The proposed algorithm, the effect of several optimizations, and the progressiveness are experimentally evaluated.
------------------------------------------------------------------------------------------------------------------------------------------------------
In [87]:
# Print the first five rows of `result_word2vec_vectorization`, one framed card per article.
card_border = '-----' * 30
top_word2vec = result_word2vec_vectorization.head(5)

for article_title, article_abstract in zip(top_word2vec['title'], top_word2vec['abstract']):
    # A single print call; the empty string yields the blank line between title and abstract.
    print(
        card_border,
        f"Title: {article_title}",
        '',
        f"Abstract: {article_abstract}",
        card_border,
        sep='\n',
    )
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Demand driven store site selection via multiple spatial-temporal data

Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Mining significant semantic locations from GPS data

Abstract: With the increasing deployment and use of GPS-enabled devices, massive amounts of GPS data are becoming available. We propose a general framework for the mining of semantically meaningful, significant locations, e.g., shopping malls and restaurants, from such data.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Location-based recommendation system using Bayesian user's preference model in mobile devices

Abstract: As wireless communication advances, research on location-based services using mobile devices has attracted interest, which provides information and services related to user's physical location. As increasing information and services, it becomes difficult to find a proper service that reflects the individual preference at proper time. Due to the small screen of mobile devices and insufficiency of resources, personalized services and convenient user interface might be useful. In this paper, we propose a map-based personalized recommendation system which reflects user's preference modeled by Bayesian Networks (BN). The structure of BN is built by an expert while the parameter is learned from the dataset. The proposed system collects context information, location, time, weather, and user request from the mobile device and infers the most preferred item to provide an appropriate service by displaying onto the mini map.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Optimal network location queries

Abstract: Given a set S of sites and a set O of weighted objects, an optimal location query finds the location(s) where introducing a new site maximizes the total weight of the objects that are closer to the new site than to any other site. With such a query, for instance, a franchise corporation (e.g., McDonald's) can find a location to open a new store such that the number of potential store customers (i.e., people living close to the store) is maximized. Optimal location queries are computationally complex to compute and require efficient solutions that scale with large datasets. Previously, two specific approaches have been proposed for efficient computation of optimal location queries. However, they both assume p-norm distance (namely, L1 and L2/Euclidean); hence, they are not applicable where sites and objects are located on spatial networks. In this paper, we focus on optimal network location (ONL) queries, i.e., optimal location queries with which objects and sites reside on a spatial network. We introduce an approach, namely EONL (short for Expansion-based ONL), which enables efficient computation of ONL queries. Moreover, with an extensive experimental study we verify and compare the efficiency of our proposed approach with real datasets, and we demonstrate the importance of considering network distance (rather than p-norm distance) with ONL queries.
------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------
Title: Exploiting geographic dependencies for real estate appraisal: a mutual perspective of ranking and clustering

Abstract: It is traditionally a challenge for home buyers to understand, compare and contrast the investment values of real estates. While a number of estate appraisal methods have been developed to value real property, the performances of these methods have been limited by the traditional data sources for estate appraisal. However, with the development of new ways of collecting estate-related mobile data, there is a potential to leverage geographic dependencies of estates for enhancing estate appraisal. Indeed, the geographic dependencies of the value of an estate can be from the characteristics of its own neighborhood (individual), the values of its nearby estates (peer), and the prosperity of the affiliated latent business area (zone). To this end, in this paper, we propose a geographic method, named ClusRanking, for estate appraisal by leveraging the mutual enforcement of ranking and clustering power. ClusRanking is able to exploit geographic individual, peer, and zone dependencies in a probabilistic ranking model. Specifically, we first extract the geographic utility of estates from geography data, estimate the neighborhood popularity of estates by mining taxicab trajectory data, and model the influence of latent business areas via ClusRanking. Also, we use a linear model to fuse these three influential factors and predict estate investment values. Moreover, we simultaneously consider individual, peer and zone dependencies, and derive an estate-specific ranking likelihood as the objective function. Finally, we conduct a comprehensive evaluation with real-world estate related data, and the experimental results demonstrate the effectiveness of our method.
------------------------------------------------------------------------------------------------------------------------------------------------------

Сравнение рекомендаций¶

In [88]:
# Result frames of every recommendation approach, keyed by the approach label.
# Insertion order is kept, so downstream columns appear in this order.
recommendations = dict(
    [
        ('Custom', result),
        ('Flag', result_flag_vectorization),
        ('Count', result_count_vectorization),
        ('Tf-Idf', result_tfidf_vectorization),
        ('Word2Vec', result_word2vec_vectorization),
    ]
)
In [89]:
# Attach each method's cosine similarity to `data` as a '<method> method' column.
#
# `DataFrame.assign` aligns every Series on the row index of `data` and overwrites
# a column that already exists. The previous concat-in-a-loop appended a fresh set
# of columns on every execution, so re-running the cell produced duplicate
# '<method> method' columns and inflated the later 'Total score' sum.
data = data.assign(**{
    f'{method} method': recommendation['cosine_similarity']
    for method, recommendation in recommendations.items()
})
In [90]:
# Display the frame, which now carries one '<method> method' similarity column per approach.
data
Out[90]:
url deapth title authors source number and pages doi published citation metric abstract references process_abstract Custom method Flag method Count method Tf-Idf method Word2Vec method
0 https://dl.acm.org/doi/10.1145/2996913.2996996 0 Demand driven store site selection via multipl... [Mengwen Xu, Tianyi Wang, Zhengwei Wu, Jingbo ... SIGSPACIAL '16: Proceedings of the 24th ACM SI... Article No.: 40, Pages 1 - 10 https://doi.org/10.1145/2996913.2996996 2016-10-31 26.0 617.0 Choosing a good location when opening a new st... [https://dl.acm.org/doi/10.1016/S0305-0548(01)... choose good location open new store crucial fu... 0.312190 1.000000 1.000000 1.000000 1.000000
1 https://dl.acm.org/doi/10.1145/2996913.2996996 1 The generalized maximal covering location problem [Oded Berman, Dmitry Krass] Computers and Operations Research NaN https://doi.org/10.1016/S0305-0548(01)00079-X 2002-05-01 34.0 0.0 We consider a generalization of the maximal co... [] consider generalization maximal cover location... 0.146254 0.089562 0.150723 0.064708 0.170892
2 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Random Forests [Leo Breiman] Machine Learning NaN https://doi.org/10.1023/A:1010933404324 2001-10-01 9828.0 0.0 Random forests are a combination of tree predi... [https://dl.acm.org/doi/10.1162/neco.1997.9.7.... random forest combination tree predictor tree ... 0.033315 0.081604 0.048373 0.034469 0.080029
3 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Efficient algorithms for optimal location quer... [Zitong Chen, Yubao Liu, Raymond Chi-Wing Wong... SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... NaN https://doi.org/10.1145/2588555.2612172 2014-06-18 47.0 790.0 In this paper, we study the optimal location q... [https://dl.acm.org/doi/10.14778/2350229.23502... paper study optimal location query problem bas... 0.126302 0.180469 0.203188 0.093193 0.210058
4 https://dl.acm.org/doi/10.1145/2996913.2996996 1 Mean Shift: A Robust Approach Toward Feature S... [Dorin Comaniciu, Peter Meer] IEEE Transactions on Pattern Analysis and Mach... NaN https://doi.org/10.1109/34.1000236 2002-05-01 2062.0 0.0 A general nonparametric technique is proposed ... [https://dl.acm.org/doi/10.1007/BF00128233, ht... general nonparametric technique propose analys... 0.057567 0.094007 0.082959 0.038648 0.122549
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
164 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Machine learning approaches for high-resolutio... [Ranga Raju Vatsavai, Eddie Bright, Chandola V... COM.Geo '11: Proceedings of the 2nd Internatio... Article No.: 11, Pages 1 - 10 https://doi.org/10.1145/1999320.1999331 2011-05-23 18.0 526.0 The proliferation of several machine learning ... [https://dl.acm.org/doi/10.5555/1191551.119179... proliferation machine learning approach make d... 0.000000 0.042875 0.017969 0.015740 0.013971
165 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Geographical topic discovery and comparison [Zhijun Yin, Liangliang Cao, Jiawei Han, Cheng... WWW '11: Proceedings of the 20th international... NaN https://doi.org/10.1145/1963405.1963443 2011-03-28 232.0 1642.0 This paper studies the problem of discovering ... [https://dl.acm.org/doi/10.5555/944919.944937,... paper study problem discover compare geographi... 0.080845 0.132020 0.194915 0.082114 0.126442
166 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Driving with knowledge from the physical world [Jing Yuan, Yu Zheng, Xing Xie, Guangzhong Sun] KDD '11: Proceedings of the 17th ACM SIGKDD in... NaN https://doi.org/10.1145/2020408.2020462 2011-08-21 641.0 2908.0 This paper presents a Cloud-based system compu... [https://dl.acm.org/doi/10.1016/j.eswa.2008.07... paper present cloud base system computing cust... 0.109150 0.178240 0.113181 0.076104 0.132803
167 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Where to find my next passenger [Jing Yuan, Yu Zheng, Liuhang Zhang, XIng Xie,... UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030128 2011-09-17 276.0 2024.0 We present a recommender for taxi drivers and ... [https://dl.acm.org/doi/10.1145/304182.304187,... present recommender taxi driver people expect ... 0.106132 0.101100 0.123165 0.043080 0.098161
168 https://dl.acm.org/doi/10.1145/2487575.2487616 2 Urban computing with taxicabs [Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie] UbiComp '11: Proceedings of the 13th internati... NaN https://doi.org/10.1145/2030112.2030126 2011-09-17 413.0 3122.0 Urban computing for city planning is one of th... [https://dl.acm.org/doi/10.5555/645484.656550,... urban computing city planning significant appl... 0.107280 0.131390 0.085192 0.048406 0.077540

169 rows × 18 columns

In [91]:
# 'Total score' is the row-wise sum of the similarity columns of all five methods.
data['Total score'] = data.loc[
    :,
    [
        'Custom method',
        'Flag method',
        'Count method',
        'Tf-Idf method',
        'Word2Vec method',
    ],
].sum(axis='columns')
In [92]:
# The fifteen articles with the highest 'Total score', restricted to descriptive columns.
(
    data
    .sort_values(by='Total score', ascending=False)
    .head(15)
    .loc[:, ['title', 'published', 'citation', 'metric', 'abstract', 'Total score']]
)
Out[92]:
title published citation metric abstract Total score
0 Demand driven store site selection via multipl... 2016-10-31 26.0 617.0 Choosing a good location when opening a new st... 4.312190
7 Optimal network location queries 2010-11-02 21.0 199.0 Given a set S of sites and a set O of weighted... 1.165115
30 Progressive computation of the min-dist optima... 2006-09-01 36.0 312.0 This paper proposes and solves the min-dist op... 1.148950
159 A taxi business intelligence system 2011-08-21 54.0 1288.0 The increasing availability of large-scale loc... 1.148390
31 MaxFirst for MaxBRkNN 2011-04-11 26.0 0.0 The MaxBRNN problem finds a region such that s... 1.039957
79 Location-based and preference-aware recommenda... 2012-11-06 503.0 3713.0 The popularity of location-based social networ... 0.993410
65 Semi-supervised document retrieval 2009-05-01 21.0 0.0 This paper proposes a new machine learning met... 0.965735
13 Trade area analysis using user generated mobil... 2013-05-13 57.0 693.0 In this paper, we illustrate how User Generate... 0.961256
22 The optimal-location query 2005-08-22 38.0 0.0 We propose and solve the optimal-location quer... 0.956050
125 Privacy-friendly business models for location-... 2011-08-01 7.0 0.0 This paper presents a theoretical model to ana... 0.954832
9 Geo-spotting: mining online location-based ser... 2013-08-11 193.0 2151.0 The problem of identifying the optimal locatio... 0.952292
6 Exploiting geographic dependencies for real es... 2014-08-24 81.0 1076.0 It is traditionally a challenge for home buyer... 0.894539
29 Efficient methods for finding influential loca... 2011-10-24 30.0 207.0 Given a set S of servers and a set C of client... 0.890261
80 Learning to rank using gradient descent 2005-08-07 1734.0 7323.0 We investigate using gradient descent methods ... 0.874327
135 Mining significant semantic locations from GPS... 2010-09-01 254.0 1734.0 With the increasing deployment and use of GPS-... 0.866132
In [93]:
# Persist the scored articles to an Excel workbook without the row index.
# NOTE(review): the 'analysis_results' directory is assumed to exist already — confirm.
data.to_excel(excel_writer='analysis_results/analysis_articles.xlsx', index=False)

Визуализация векторов с помощью PCA¶

In [94]:
def get_pca_visualization(
    dataframe: pd.DataFrame,
    method: str,
    *,
    height: int = 800,
    width: int = 1100,
    size_max: int = 40,
) -> None:
    """Show an interactive 3-D scatter plot of PCA-reduced article vectors.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to plot. It must provide the columns 'Component 0',
        'Component 1' and 'Component 2' (the three axes),
        'Total score processed' (marker size and colour) and the hover
        columns 'title', 'published', 'citation', 'metric', 'Total score'.
    method : str
        Label of the vectorization method; used in the figure title as
        '<method> vectorization'.
    height, width : int, keyword-only
        Figure dimensions forwarded to plotly. The defaults reproduce the
        previously hard-coded 800 x 1100.
    size_max : int, keyword-only
        Largest marker size forwarded to plotly (default 40, as before).

    Returns
    -------
    None
        The figure is rendered through ``fig.show()`` and is not returned.
    """
    fig = px.scatter_3d(
        dataframe,
        x='Component 0',
        y='Component 1',
        z='Component 2',
        hover_data=['title', 'published', 'citation', 'metric', 'Total score'],
        # The same column drives both size and colour of each marker.
        size='Total score processed',
        size_max=size_max,
        color='Total score processed',
        color_continuous_scale=px.colors.diverging.Spectral_r,
        height=height,
        width=width,
        title=f'{method} vectorization',
    )

    fig.show()
In [95]:
# Document-vector representations to project with PCA, keyed by the method label.
vectors = dict(
    [
        ('Flag', flag_vectorization),
        ('Count', count_vectorization),
        ('Tf-Idf', tfidf_vectorization),
        ('Word2Vec', word2vec_vectorization),
    ]
)
In [96]:
# Reduce every vector representation to three principal components and pair the
# coordinates with the article metadata needed by the scatter plots.
pca = PCA(n_components=3, random_state=42)
decompositions = {}

for method, vector in vectors.items():
    # Integer column labels 0..2 become 'Component 0'..'Component 2'.
    components = pd.DataFrame(pca.fit_transform(vector)).add_prefix('Component ')

    # Column-wise concat aligns on the row index; the component frame carries
    # the default 0..n-1 index.
    decomposition = pd.concat(
        [
            data[['title', 'published', 'citation', 'metric', 'Total score']],
            components,
        ],
        axis=1,
    )

    # Scores above the 0.992 quantile are replaced by 1.05 x that quantile
    # (presumably so a few extreme scores do not dominate marker size/colour).
    score = decomposition['Total score']
    cap = score.quantile(0.992)
    decomposition['Total score processed'] = score.mask(score > cap, cap * 1.05)

    decompositions[method] = decomposition
In [97]:
# Render one interactive 3-D scatter per vectorization method.
for method in decompositions:
    get_pca_visualization(decompositions[method], method)
In [ ]: